From 1e80bbb5d33a18bf0665ee6d540d3b4097f5d34b Mon Sep 17 00:00:00 2001 From: Max Luebbering <2804731+le1nux@users.noreply.github.com> Date: Thu, 7 Aug 2025 00:16:27 +0200 Subject: [PATCH 01/67] feat: implemented stage FQN generation for pipeline parallelism --- src/modalities/models/parallelism/__init__.py | 0 .../parallelism/pipeline_parallelism.py | 88 +++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 src/modalities/models/parallelism/__init__.py create mode 100644 src/modalities/models/parallelism/pipeline_parallelism.py diff --git a/src/modalities/models/parallelism/__init__.py b/src/modalities/models/parallelism/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/modalities/models/parallelism/pipeline_parallelism.py b/src/modalities/models/parallelism/pipeline_parallelism.py new file mode 100644 index 000000000..e1d2233ba --- /dev/null +++ b/src/modalities/models/parallelism/pipeline_parallelism.py @@ -0,0 +1,88 @@ +# Some portions of this implementation are inspired and/or adapted +# from Meta's open-source project TorchTitan, +# licensed under the BSD 3-Clause License. + +import math +from abc import ABC, abstractmethod +from typing import Optional + +import torch +from torch.distributed.device_mesh import DeviceMesh +from torch.distributed.pipelining.schedules import PipelineScheduleSingle, get_schedule_class + +from modalities.running_env.fsdp.device_mesh import ParallelismDegrees + + +class FQNsPerStageGenerator(ABC): + @abstractmethod + def generate_fqns_per_stage( + self, num_stages: int, num_layers: int, input_layer_equivalence: int = 1, output_layer_equivalence: int = 1 + ) -> list[list[str]]: + """ + Generate a list of fully qualified names (FQNs) for each pipeline stage. + + Args: + num_stages (int): Number of stages in the pipeline. + num_layers (int): Total number of layers in the model. + input_layer_equivalence (int): Determines to how many transformer layers + the input layer corresponds. 
Default is 1. + output_layer_equivalence (int): Determines to how many transformer layers + the output layer corresponds. Default is 1. + + Returns: + list[list[str]]: A list containing an FQN list for each stage. + """ + raise NotImplementedError("This method should be implemented by subclasses.") + + +class PipelineFactory: + """Pipeline factory class to create pipelined models.""" + + @staticmethod + def create_pipeline_model( + num_layers: int, + fqns_per_stage_generator: FQNsPerStageGenerator, + device_mesh: DeviceMesh, + pp_schedule_name: str, + num_layers_per_stage: int, + input_layer_equivalence: Optional[int] = 1, + output_layer_equivalence: Optional[int] = 1, + ) -> torch.nn.Module: + device_mesh[ParallelismDegrees.PP.value] + pp_dims = device_mesh.size(ParallelismDegrees.PP.value) + schedule_class = get_schedule_class(pp_schedule_name) + is_single_stage_schedule = issubclass(schedule_class, PipelineScheduleSingle) + if not is_single_stage_schedule: + raise ValueError( + f"Unsupported pipeline schedule: {pp_schedule_name}. We only support single-stage schedules." + ) + + # calculate the number of stages + num_virtual_stages = math.ceil( + (num_layers + input_layer_equivalence + output_layer_equivalence) / num_layers_per_stage + ) + if num_virtual_stages % pp_dims != 0: + raise ValueError( + f"Number of virtual stages {num_virtual_stages} is not divisible by parallel dimensions {pp_dims}. " + f"For reference: {num_layers=} {input_layer_equivalence=} " + f"{output_layer_equivalence=} {num_layers_per_stage=}" + ) + + stages_per_rank = num_virtual_stages // pp_dims + if stages_per_rank != 1: + raise ValueError( + f"Stages per rank {stages_per_rank} must be 1 for single-stage schedules. " + f"Please adjust {num_layers_per_stage=} to ensure each PP rank has exactly one stage." 
+ ) + + fqns_per_stage_generator.generate_fqns_per_stage( + num_stages=num_virtual_stages, + num_layers=num_layers, + input_layer_equivalence=input_layer_equivalence, + output_layer_equivalence=output_layer_equivalence, + ) + + @staticmethod + def create_gpt2_model_splitter(): + """Create a GPT-2 model splitter for pipeline parallelism.""" + pass From ed93d284d108a1008b0eb7dae102f3b4a98f4c46 Mon Sep 17 00:00:00 2001 From: Max Luebbering <2804731+le1nux@users.noreply.github.com> Date: Thu, 7 Aug 2025 15:49:13 +0200 Subject: [PATCH 02/67] feat: added FQNs per stage calculation --- .../parallelism/pipeline_parallelism.py | 91 +++++++++++++++++-- 1 file changed, 83 insertions(+), 8 deletions(-) diff --git a/src/modalities/models/parallelism/pipeline_parallelism.py b/src/modalities/models/parallelism/pipeline_parallelism.py index e1d2233ba..ac6be437c 100644 --- a/src/modalities/models/parallelism/pipeline_parallelism.py +++ b/src/modalities/models/parallelism/pipeline_parallelism.py @@ -1,4 +1,4 @@ -# Some portions of this implementation are inspired and/or adapted +# Some portions of this implementation are inspired, adapted, or refactored # from Meta's open-source project TorchTitan, # licensed under the BSD 3-Clause License. @@ -14,27 +14,102 @@ class FQNsPerStageGenerator(ABC): - @abstractmethod def generate_fqns_per_stage( self, num_stages: int, num_layers: int, input_layer_equivalence: int = 1, output_layer_equivalence: int = 1 ) -> list[list[str]]: """ - Generate a list of fully qualified names (FQNs) for each pipeline stage. + Generate FQNs for each stage in a GPT-2 model. Args: num_stages (int): Number of stages in the pipeline. num_layers (int): Total number of layers in the model. - input_layer_equivalence (int): Determines to how many transformer layers - the input layer corresponds. Default is 1. - output_layer_equivalence (int): Determines to how many transformer layers - the output layer corresponds. Default is 1. 
+ input_layer_equivalence (int): Number of layers corresponding to the input layer. + output_layer_equivalence (int): Number of layers corresponding to the output layer. Returns: - list[list[str]]: A list containing an FQN list for each stage. + list[list[str]]: A list containing FQNs for each stage. + """ + + # Potential split points for GPT-2 model with each potential split point + # listing the FQNs of the modules in that stage and the computational weight. + # The computational weight of the input and output modules are estimated + # based on the number of layers they correspond to. + potential_split_points = self._get_potential_split_points( + num_layers=num_layers, + input_layer_equivalence=input_layer_equivalence, + output_layer_equivalence=output_layer_equivalence, + ) + # Calculate the weight per stage based on the total weight and number of stages + weight_per_stage = math.ceil(sum(weight for _, weight in potential_split_points) / num_stages) + # pack the stages with the layers + next_split_point = 0 + module_names_per_stage: list[list[str]] = [] + for _ in range(num_stages): + stage_fqns = [] + stage_weight = 0 + while next_split_point < len(potential_split_points): + fqns, weight = potential_split_points[next_split_point] + if weight > weight_per_stage: + raise ValueError( + f"Weight of {weight} for {fqns} exceeds weight per stage {weight_per_stage}. " + "Please adjust the number of stages or the weight distribution." + ) + if stage_weight + weight > weight_per_stage: + break + stage_fqns.extend(fqns) + stage_weight += weight + next_split_point += 1 + module_names_per_stage.append(stage_fqns) + + return module_names_per_stage + + @abstractmethod + def _get_potential_split_points( + self, num_layers: int, input_layer_equivalence: int = 1, output_layer_equivalence: int = 1 + ) -> list[tuple[list[str], int]]: + """ + Returns a list of potential split points for the GPT-2 model. + + Args: + num_layers (int): Total number of layers in the model. 
+ input_layer_equivalence (int): Number of layers corresponding to the input layer. + output_layer_equivalence (int): Number of layers corresponding to the output layer. + + Returns: + list[tuple[list[str], int]]: A list containing tuples of FQNs and their computational weights. """ raise NotImplementedError("This method should be implemented by subclasses.") +class GPT2LLMFQNsPerStageGenerator(FQNsPerStageGenerator): + def _get_potential_split_points( + self, num_layers: int, input_layer_equivalence: int = 1, output_layer_equivalence: int = 1 + ) -> list[tuple[list[str], int]]: + """ + Returns a list of potential split points for the GPT-2 model. + + Args: + num_layers (int): Total number of layers in the model. + input_layer_equivalence (int): Number of layers corresponding to the input layer. + output_layer_equivalence (int): Number of layers corresponding to the output layer. + + Returns: + list[tuple[list[str], int]]: A list containing tuples of FQNs and their computational weights. + """ + + # Potential split points for GPT-2 model with each potential split point + # listing the FQNs of the modules in that stage and the computational weight. + # The computational weight of the input and output modules are estimated + # based on the number of layers they correspond to. 
+ potential_split_points = [ + (["transformer.wte", "transformer.wpe"], input_layer_equivalence), + *[([f"transformer.h.{i}"], 1) for i in range(num_layers)], + (["transformer.lm_head_norm", "transformer.lm_head"], output_layer_equivalence), + ] + + return potential_split_points + + class PipelineFactory: """Pipeline factory class to create pipelined models.""" From 6241ea8dbfef66ac7b89e07a0ee350ddb37306a5 Mon Sep 17 00:00:00 2001 From: Max Luebbering <2804731+le1nux@users.noreply.github.com> Date: Fri, 15 Aug 2025 15:31:34 +0200 Subject: [PATCH 03/67] feat: generic FQN-based PP staging --- .../parallelism/pipeline_parallelism.py | 279 +++++++++--------- 1 file changed, 145 insertions(+), 134 deletions(-) diff --git a/src/modalities/models/parallelism/pipeline_parallelism.py b/src/modalities/models/parallelism/pipeline_parallelism.py index ac6be437c..0af1551af 100644 --- a/src/modalities/models/parallelism/pipeline_parallelism.py +++ b/src/modalities/models/parallelism/pipeline_parallelism.py @@ -2,162 +2,173 @@ # from Meta's open-source project TorchTitan, # licensed under the BSD 3-Clause License. -import math -from abc import ABC, abstractmethod -from typing import Optional +import copy +from typing import Any, Optional, Type import torch +import torch.nn as nn from torch.distributed.device_mesh import DeviceMesh +from torch.distributed.pipelining import PipelineStage from torch.distributed.pipelining.schedules import PipelineScheduleSingle, get_schedule_class +from modalities.models.parallelism.stages_generator import StagesGenerator from modalities.running_env.fsdp.device_mesh import ParallelismDegrees -class FQNsPerStageGenerator(ABC): - def generate_fqns_per_stage( - self, num_stages: int, num_layers: int, input_layer_equivalence: int = 1, output_layer_equivalence: int = 1 - ) -> list[list[str]]: - """ - Generate FQNs for each stage in a GPT-2 model. - - Args: - num_stages (int): Number of stages in the pipeline. 
- num_layers (int): Total number of layers in the model. - input_layer_equivalence (int): Number of layers corresponding to the input layer. - output_layer_equivalence (int): Number of layers corresponding to the output layer. - - Returns: - list[list[str]]: A list containing FQNs for each stage. - """ - - # Potential split points for GPT-2 model with each potential split point - # listing the FQNs of the modules in that stage and the computational weight. - # The computational weight of the input and output modules are estimated - # based on the number of layers they correspond to. - potential_split_points = self._get_potential_split_points( - num_layers=num_layers, - input_layer_equivalence=input_layer_equivalence, - output_layer_equivalence=output_layer_equivalence, - ) - # Calculate the weight per stage based on the total weight and number of stages - weight_per_stage = math.ceil(sum(weight for _, weight in potential_split_points) / num_stages) - # pack the stages with the layers - next_split_point = 0 - module_names_per_stage: list[list[str]] = [] - for _ in range(num_stages): - stage_fqns = [] - stage_weight = 0 - while next_split_point < len(potential_split_points): - fqns, weight = potential_split_points[next_split_point] - if weight > weight_per_stage: - raise ValueError( - f"Weight of {weight} for {fqns} exceeds weight per stage {weight_per_stage}. " - "Please adjust the number of stages or the weight distribution." - ) - if stage_weight + weight > weight_per_stage: - break - stage_fqns.extend(fqns) - stage_weight += weight - next_split_point += 1 - module_names_per_stage.append(stage_fqns) - - return module_names_per_stage - - @abstractmethod - def _get_potential_split_points( - self, num_layers: int, input_layer_equivalence: int = 1, output_layer_equivalence: int = 1 - ) -> list[tuple[list[str], int]]: - """ - Returns a list of potential split points for the GPT-2 model. - - Args: - num_layers (int): Total number of layers in the model. 
- input_layer_equivalence (int): Number of layers corresponding to the input layer. - output_layer_equivalence (int): Number of layers corresponding to the output layer. - - Returns: - list[tuple[list[str], int]]: A list containing tuples of FQNs and their computational weights. - """ - raise NotImplementedError("This method should be implemented by subclasses.") - - -class GPT2LLMFQNsPerStageGenerator(FQNsPerStageGenerator): - def _get_potential_split_points( - self, num_layers: int, input_layer_equivalence: int = 1, output_layer_equivalence: int = 1 - ) -> list[tuple[list[str], int]]: - """ - Returns a list of potential split points for the GPT-2 model. - - Args: - num_layers (int): Total number of layers in the model. - input_layer_equivalence (int): Number of layers corresponding to the input layer. - output_layer_equivalence (int): Number of layers corresponding to the output layer. - - Returns: - list[tuple[list[str], int]]: A list containing tuples of FQNs and their computational weights. - """ - - # Potential split points for GPT-2 model with each potential split point - # listing the FQNs of the modules in that stage and the computational weight. - # The computational weight of the input and output modules are estimated - # based on the number of layers they correspond to. 
- potential_split_points = [ - (["transformer.wte", "transformer.wpe"], input_layer_equivalence), - *[([f"transformer.h.{i}"], 1) for i in range(num_layers)], - (["transformer.lm_head_norm", "transformer.lm_head"], output_layer_equivalence), - ] - - return potential_split_points - - class PipelineFactory: """Pipeline factory class to create pipelined models.""" @staticmethod - def create_pipeline_model( - num_layers: int, - fqns_per_stage_generator: FQNsPerStageGenerator, + def get_pipelined_model( + whole_model: nn.Module, + stages_generator: StagesGenerator, device_mesh: DeviceMesh, + local_rank: int, pp_schedule_name: str, num_layers_per_stage: int, - input_layer_equivalence: Optional[int] = 1, - output_layer_equivalence: Optional[int] = 1, ) -> torch.nn.Module: - device_mesh[ParallelismDegrees.PP.value] - pp_dims = device_mesh.size(ParallelismDegrees.PP.value) + device = torch.device("cuda", local_rank) + pp_dims = device_mesh[ParallelismDegrees.PP.value].size() + + fqns_per_stage = stages_generator.get_stages( + num_layers_per_stage=num_layers_per_stage, + pp_dims=pp_dims, + ) + + pp_mesh = device_mesh[ParallelismDegrees.PP.value] schedule_class = get_schedule_class(pp_schedule_name) is_single_stage_schedule = issubclass(schedule_class, PipelineScheduleSingle) if not is_single_stage_schedule: raise ValueError( f"Unsupported pipeline schedule: {pp_schedule_name}. We only support single-stage schedules." ) - - # calculate the number of stages - num_virtual_stages = math.ceil( - (num_layers + input_layer_equivalence + output_layer_equivalence) / num_layers_per_stage - ) - if num_virtual_stages % pp_dims != 0: - raise ValueError( - f"Number of virtual stages {num_virtual_stages} is not divisible by parallel dimensions {pp_dims}. 
" - f"For reference: {num_layers=} {input_layer_equivalence=} " - f"{output_layer_equivalence=} {num_layers_per_stage=}" - ) - - stages_per_rank = num_virtual_stages // pp_dims - if stages_per_rank != 1: - raise ValueError( - f"Stages per rank {stages_per_rank} must be 1 for single-stage schedules. " - f"Please adjust {num_layers_per_stage=} to ensure each PP rank has exactly one stage." - ) - - fqns_per_stage_generator.generate_fqns_per_stage( - num_stages=num_virtual_stages, - num_layers=num_layers, - input_layer_equivalence=input_layer_equivalence, - output_layer_equivalence=output_layer_equivalence, + stage, model = PipelineFactory._get_split_model( + whole_model=whole_model, + schedule_class=schedule_class, + pp_mesh=pp_mesh, + device=device, + fqns_per_stage=fqns_per_stage, ) + return whole_model # TODO return pipelined model @staticmethod - def create_gpt2_model_splitter(): - """Create a GPT-2 model splitter for pipeline parallelism.""" - pass + def _get_split_model( + whole_model: nn.Module, + schedule_class: Type[PipelineScheduleSingle], + pp_mesh: DeviceMesh, + device: torch.device, + fqns_per_stage: list[list[str]], + ) -> tuple[PipelineStage, nn.Module]: + def get_stage_id_of_pp_rank(pp_mesh: DeviceMesh): + # NOTE: torch titan a more complicated way to get the stage id of pp rank + # since they also allow for multi-stage schedules + pp_rank = pp_mesh.get_local_rank() + return pp_rank + + @staticmethod + def _get_fqn_tree(fqns: list[str]) -> dict[str, Any]: + fqn_tree = {} + fqns = set(fqns) # Ensure unique FQNs + for fqn in fqns: + parts = fqn.split(".") + current_level = fqn_tree + for part in parts[:-1]: + if part not in current_level: + current_level[part] = {} + elif len(current_level) == 0: + raise ValueError(f"Part {part} of {fqn} already exists " "in the tree as a leaf node.") + current_level = current_level[part] + if parts[-1] in current_level: + raise ValueError( + f" Leaf of {fqn} has already been defined in the tree as an intermediadate node 
or leaf! " + "Cannot replace the existing node as a leaf." + ) + current_level[parts[-1]] = {} + + return fqn_tree + + def _build_stage_from_modules( + fqn_tree: dict[str, Any], module: nn.Module, module_name: Optional[str] = None + ) -> tuple[PipelineStage, nn.Module]: + if isinstance(module, nn.ModuleDict): + if module_name not in fqn_tree: + dict_modules = nn.ModuleDict({}) + else: + if len(fqn_tree) == 0: + # If the module is a leaf node, we can directly use it + dict_modules = module + else: + # If the module is not a leaf node, we need to build a staged module + # recursively from the FQN tree + dict_modules = {} + dict_module_names = [name for name in module.keys() if name in fqn_tree[module_name]] + for dict_module_name in dict_module_names: + dict_modules[dict_module_name] = _build_stage_from_modules( + fqn_tree=fqn_tree[module_name], + module=module[dict_module_name], + module_name=dict_module_name, + ) + dict_modules = nn.ModuleDict(dict_modules) + # setattr(module, module_name, dict_modules) + return dict_modules + + elif isinstance(module, nn.ModuleList): + if module_name not in fqn_tree: + list_modules = nn.ModuleList([]) + else: + if len(fqn_tree[module_name]) == 0: + # If the module is a leaf node, we can directly use it + list_modules = module + else: + # If the module is not a leaf node, we need to build a staged module + # recursively from the FQN tree + list_modules = [] + list_indices = [i for i in range(len(module)) if str(i) in fqn_tree[module_name]] + for idx in list_indices: + list_modules.append( + _build_stage_from_modules( + fqn_tree=fqn_tree[module_name], module=module[idx], module_name=str(idx) + ) + ) + list_modules = nn.ModuleList(list_modules) + # setattr(module, module_name, list_modules) + return list_modules + + else: # normal nn.Module + if module_name is not None and module_name not in fqn_tree: + # If the module is not in the FQN tree, set it to None + return None + elif module_name is not None and len(fqn_tree[module_name]) 
== 0: + # If the module is a leaf node, we can directly use it + return module + else: + # If the module is in the FQN tree, we need to build a staged module + # recursively from the FQN tree + for module_name, module_value in module.named_children(): + # If the module is not a leaf node, we need to build a staged module + # recursively from the FQN tree + staged_module = _build_stage_from_modules( + fqn_tree=fqn_tree, module=module_value, module_name=module_name + ) + setattr(module, module_name, staged_module) + + return module + + if not issubclass(schedule_class, PipelineScheduleSingle): + raise NotImplementedError("Only single-stage schedules are supported for pipeline parallelism.") + + # NOTE: For multi-stage schedule, e.g., Interleaved 1F1B, we have multiple stages per pp rank. + # This would need to be adapted accordingly in this case. + stage_idx = get_stage_id_of_pp_rank(pp_mesh) + module_names = fqns_per_stage[stage_idx] + whole_model = copy.deepcopy(whole_model) + fqn_tree = _get_fqn_tree(module_names) + stage_modules = _build_stage_from_modules(fqn_tree, whole_model) + stage = PipelineStage( + submodule=stage_modules, + stage_index=stage_idx, + num_stages=len(fqns_per_stage), + device=device, + group=pp_mesh.get_group("pp"), + ) + return stage, whole_model From 0ba8fbc4e0e8d20623d32844c90725a6beea3d09 Mon Sep 17 00:00:00 2001 From: Max Luebbering <2804731+le1nux@users.noreply.github.com> Date: Fri, 15 Aug 2025 15:32:08 +0200 Subject: [PATCH 04/67] feat: added PP configs --- .../pipeline_parallelism_configs.py | 22 +++++++++++++++++++ .../parallelism/stages_generator_configs.py | 13 +++++++++++ 2 files changed, 35 insertions(+) create mode 100644 src/modalities/models/parallelism/pipeline_parallelism_configs.py create mode 100644 src/modalities/models/parallelism/stages_generator_configs.py diff --git a/src/modalities/models/parallelism/pipeline_parallelism_configs.py b/src/modalities/models/parallelism/pipeline_parallelism_configs.py new file mode 
100644 index 000000000..61b8b5ba4 --- /dev/null +++ b/src/modalities/models/parallelism/pipeline_parallelism_configs.py @@ -0,0 +1,22 @@ +from typing import Annotated + +from pydantic import BaseModel, Field + +from modalities.config.pydantic_if_types import ( + PydanticDeviceMeshIFType, + PydanticPytorchModuleType, + PydanticStagesGeneratorType, +) + + +class FQNsPerStageGeneratorConfig(BaseModel): + pass + + +class PipelinedModelConfig(BaseModel): + whole_model: PydanticPytorchModuleType + stages_generator: PydanticStagesGeneratorType + device_mesh: PydanticDeviceMeshIFType + local_rank: Annotated[int, Field(strict=True, ge=0)] + pp_schedule_name: str + num_layers_per_stage: Annotated[int, Field(strict=True, ge=1)] diff --git a/src/modalities/models/parallelism/stages_generator_configs.py b/src/modalities/models/parallelism/stages_generator_configs.py new file mode 100644 index 000000000..610be7fdd --- /dev/null +++ b/src/modalities/models/parallelism/stages_generator_configs.py @@ -0,0 +1,13 @@ +from typing import Annotated + +from pydantic import BaseModel, Field + + +class FQNsPerStageGeneratorConfig(BaseModel): + pass + + +class GPT2LLMStagesGeneratorConfig(BaseModel): + num_model_layers: Annotated[int, Field(strict=True, ge=1)] + input_layer_equivalence: Annotated[int, Field(strict=True, ge=1)] = 1 + output_layer_equivalence: Annotated[int, Field(strict=True, ge=1)] = 1 From 4a41b6c3648a56a52af86af1a30de34c2215644c Mon Sep 17 00:00:00 2001 From: Max Luebbering <2804731+le1nux@users.noreply.github.com> Date: Fri, 15 Aug 2025 15:32:55 +0200 Subject: [PATCH 05/67] feat: wired up PP within dependency graph --- src/modalities/config/pydantic_if_types.py | 2 ++ src/modalities/registry/components.py | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/modalities/config/pydantic_if_types.py b/src/modalities/config/pydantic_if_types.py index aa12a444d..eb7d0bce1 100644 --- a/src/modalities/config/pydantic_if_types.py +++ 
b/src/modalities/config/pydantic_if_types.py @@ -21,6 +21,7 @@ from modalities.inference.text.inference_component import TextInferenceComponent from modalities.logging_broker.subscriber import MessageSubscriberIF from modalities.loss_functions import Loss +from modalities.models.parallelism.pipeline_parallelism import StagesGenerator from modalities.nn.model_initialization.initialization_if import ModelInitializationIF from modalities.tokenization.tokenizer_wrapper import TokenizerWrapper from modalities.training.gradient_clipping.gradient_clipper import GradientClipperIF @@ -83,3 +84,4 @@ def __get_pydantic_core_schema__( PydanticDatasetBatchGeneratorIFType = Annotated[ DatasetBatchGeneratorIF, PydanticThirdPartyTypeIF(DatasetBatchGeneratorIF) ] +PydanticStagesGeneratorType = Annotated[StagesGenerator, PydanticThirdPartyTypeIF(StagesGenerator)] diff --git a/src/modalities/registry/components.py b/src/modalities/registry/components.py index 28afab4bb..e6da12819 100644 --- a/src/modalities/registry/components.py +++ b/src/modalities/registry/components.py @@ -86,6 +86,10 @@ from modalities.models.gpt2.gpt2_model import GPT2LLMConfig from modalities.models.huggingface.huggingface_model import HuggingFacePretrainedModel, HuggingFacePretrainedModelConfig from modalities.models.model_factory import GPT2ModelFactory, ModelFactory +from modalities.models.parallelism.pipeline_parallelism import PipelineFactory +from modalities.models.parallelism.pipeline_parallelism_configs import PipelinedModelConfig +from modalities.models.parallelism.stages_generator import GPT2LLMStagesGenerator +from modalities.models.parallelism.stages_generator_configs import GPT2LLMStagesGeneratorConfig from modalities.nn.model_initialization.composed_initialization import ( ComposedInitializationRoutines, ComposedModelInitializationConfig, @@ -174,6 +178,9 @@ class ComponentEntity: ComponentEntity( "model", "debugging_enriched", ModelFactory.get_debugging_enriched_model, 
DebuggingEnrichedModelConfig ), + ComponentEntity("model", "pipelined", PipelineFactory.get_pipelined_model, PipelinedModelConfig), + # Pipeline Stages Generators + ComponentEntity("stages_generator", "gpt2_stages_generator", GPT2LLMStagesGenerator, GPT2LLMStagesGeneratorConfig), # Device mesh ComponentEntity("device_mesh", "default", get_device_mesh, DeviceMeshConfig), # weight initializers @@ -209,7 +216,6 @@ class ComponentEntity: # tokenizers ComponentEntity("tokenizer", "pretrained_hf_tokenizer", PreTrainedHFTokenizer, PreTrainedHFTokenizerConfig), ComponentEntity("tokenizer", "pretrained_sp_tokenizer", PreTrainedSPTokenizer, PreTrainedSPTokenizerConfig), - # ComponentEntity("tokenizer", "llama_tokenizer_fast", GPT2TokenizerFast, None), # TODO # datasets ComponentEntity("dataset", "mem_map_dataset", DatasetFactory.get_mem_map_dataset, MemMapDatasetConfig), ComponentEntity( From ee529b746da46c0eacba9b771c9a39db70152432 Mon Sep 17 00:00:00 2001 From: Max Luebbering <2804731+le1nux@users.noreply.github.com> Date: Fri, 15 Aug 2025 15:33:34 +0200 Subject: [PATCH 06/67] feat: added FQN stages generator --- .../models/parallelism/stages_generator.py | 120 ++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 src/modalities/models/parallelism/stages_generator.py diff --git a/src/modalities/models/parallelism/stages_generator.py b/src/modalities/models/parallelism/stages_generator.py new file mode 100644 index 000000000..0a212672a --- /dev/null +++ b/src/modalities/models/parallelism/stages_generator.py @@ -0,0 +1,120 @@ +# Some portions of this implementation are inspired, adapted, or refactored +# from Meta's open-source project TorchTitan, +# licensed under the BSD 3-Clause License. 
+ +import math +from abc import ABC, abstractmethod + + +class StagesGenerator(ABC): + def __init__(self, num_model_layers: int, input_layer_equivalence: int = 1, output_layer_equivalence: int = 1): + self._num_model_layers = num_model_layers + self._input_layer_equivalence = input_layer_equivalence + self._output_layer_equivalence = output_layer_equivalence + + def get_stages(self, num_layers_per_stage: int, pp_dims: int) -> list[list[str]]: + """ + Generate FQNs for each stage in a GPT-2 model. + + Args: + num_layers_per_stage (int): Number of layers per stage. + pp_dims (int): Number of pipeline parallel dimensions. + + Returns: + list[list[str]]: A list containing FQNs for each stage. + """ + + # calculate the number of stages + num_virtual_stages = math.ceil( + (self._num_model_layers + self._input_layer_equivalence + self._output_layer_equivalence) + / num_layers_per_stage + ) + if num_virtual_stages % pp_dims != 0: + raise ValueError( + f"Number of virtual stages {num_virtual_stages} is not divisible by parallel dimensions {pp_dims}. " + f"For reference: {self._num_model_layers=} {self._input_layer_equivalence=} " + f"{self._output_layer_equivalence=} {num_layers_per_stage=}" + ) + + stages_per_rank = num_virtual_stages // pp_dims + if stages_per_rank != 1: + raise ValueError( + f"Stages per rank {stages_per_rank} must be 1 for single-stage schedules. " + f"Please adjust {num_layers_per_stage=} to ensure each PP rank has exactly one stage." + ) + + # Potential split points for GPT-2 model with each potential split point + # listing the FQNs of the modules in that stage and the computational weight. + # The computational weight of the input and output modules are estimated + # based on the number of layers they correspond to. 
+ potential_split_points = self._get_potential_split_points() + # Calculate the weight per stage based on the total weight and number of stages + weight_per_stage = math.ceil(sum(weight for _, weight in potential_split_points) / num_virtual_stages) + # pack the stages with the layers + next_split_point = 0 + module_names_per_stage: list[list[str]] = [] + for _ in range(num_virtual_stages): + stage_fqns = [] + stage_weight = 0 + while next_split_point < len(potential_split_points): + fqns, weight = potential_split_points[next_split_point] + if weight > weight_per_stage: + raise ValueError( + f"Weight of {weight} for {fqns} exceeds weight per stage {weight_per_stage}. " + "Please adjust the number of stages or the weight distribution." + ) + if stage_weight + weight > weight_per_stage: + break + stage_fqns.extend(fqns) + stage_weight += weight + next_split_point += 1 + module_names_per_stage.append(stage_fqns) + + return module_names_per_stage + + @abstractmethod + def _get_potential_split_points(self) -> list[tuple[list[str], int]]: + """ + Returns a list of potential split points for the GPT-2 model. + + Args: + num_model_layers (int): Total number of layers in the model. + input_layer_equivalence (int): Number of layers corresponding to the input layer. + output_layer_equivalence (int): Number of layers corresponding to the output layer. + + Returns: + list[tuple[list[str], int]]: A list containing tuples of FQNs and their computational weights. + """ + raise NotImplementedError("This method should be implemented by subclasses.") + + +class GPT2LLMStagesGenerator(StagesGenerator): + def __init__(self, num_model_layers: int, input_layer_equivalence: int = 1, output_layer_equivalence: int = 1): + super().__init__(num_model_layers, input_layer_equivalence, output_layer_equivalence) + + def _get_potential_split_points( + self, + ) -> list[tuple[list[str], int]]: + """ + Returns a list of potential split points for the GPT-2 model. 
+ + Args: + num_model_layers (int): Total number of layers in the model. + input_layer_equivalence (int): Number of layers corresponding to the input layer. + output_layer_equivalence (int): Number of layers corresponding to the output layer. + + Returns: + list[tuple[list[str], int]]: A list containing tuples of FQNs and their computational weights. + """ + + # Potential split points for GPT-2 model with each potential split point + # listing the FQNs of the modules in that stage and the computational weight. + # The computational weight of the input and output modules are estimated + # based on the number of layers they correspond to. + potential_split_points = [ + (["transformer.wte", "transformer.wpe", "transformer.drop"], self._input_layer_equivalence), + *[([f"transformer.h.{i}"], 1) for i in range(self._num_model_layers)], + (["transformer.lm_head_norm", "transformer.lm_head"], self._output_layer_equivalence), + ] + + return potential_split_points From 625de592c02572db7626168d8504118909b768e1 Mon Sep 17 00:00:00 2001 From: Max Luebbering <2804731+le1nux@users.noreply.github.com> Date: Mon, 18 Aug 2025 23:47:00 +0200 Subject: [PATCH 07/67] feat: implemented scheduled pipeline --- .../parallelism/pipeline_parallelism.py | 84 ++++++++++++++++++- 1 file changed, 80 insertions(+), 4 deletions(-) diff --git a/src/modalities/models/parallelism/pipeline_parallelism.py b/src/modalities/models/parallelism/pipeline_parallelism.py index 0af1551af..e9ac0c755 100644 --- a/src/modalities/models/parallelism/pipeline_parallelism.py +++ b/src/modalities/models/parallelism/pipeline_parallelism.py @@ -3,6 +3,7 @@ # licensed under the BSD 3-Clause License. 
import copy +from enum import Enum from typing import Any, Optional, Type import torch @@ -11,22 +12,72 @@ from torch.distributed.pipelining import PipelineStage from torch.distributed.pipelining.schedules import PipelineScheduleSingle, get_schedule_class +from modalities.loss_functions import Loss from modalities.models.parallelism.stages_generator import StagesGenerator from modalities.running_env.fsdp.device_mesh import ParallelismDegrees +from modalities.utils.logger_utils import get_logger + +logger = get_logger(__name__) + + +class Pipeline: + def __init__( + self, + stage: PipelineStage, + model: nn.Module, + schedule: Optional[PipelineScheduleSingle] = None, + ): + self._stage = stage + self._model = model + self._schedule = schedule + + @property + def is_first_stage(self) -> bool: + return self._stage.is_first + + @property + def is_last_stage(self) -> bool: + return self._stage.is_last + + @property.setter + def schedule(self, schedule: PipelineScheduleSingle): + self._schedule = schedule + + +class PipelineSelectionTypes(Enum): + """Enum for pipeline selection types.""" + + STAGE = "stage" + MODEL = "model" + SCHEDULE = "schedule" + + +class ComponentSelectorFromPipeline: + @staticmethod + def select(pipeline: Pipeline, selection_type: PipelineSelectionTypes) -> Any: + """Selects a component from the pipeline based on the selection type.""" + if selection_type == PipelineSelectionTypes.STAGE: + return pipeline._stage + elif selection_type == PipelineSelectionTypes.MODEL: + return pipeline._model + elif selection_type == PipelineSelectionTypes.SCHEDULE: + return pipeline._schedule + else: + raise ValueError(f"Unsupported selection type: {selection_type}") class PipelineFactory: """Pipeline factory class to create pipelined models.""" @staticmethod - def get_pipelined_model( + def get_staged_pipeline( whole_model: nn.Module, stages_generator: StagesGenerator, device_mesh: DeviceMesh, local_rank: int, pp_schedule_name: str, num_layers_per_stage: int, - ) 
-> torch.nn.Module: + ) -> Pipeline: device = torch.device("cuda", local_rank) pp_dims = device_mesh[ParallelismDegrees.PP.value].size() @@ -42,6 +93,10 @@ def get_pipelined_model( raise ValueError( f"Unsupported pipeline schedule: {pp_schedule_name}. We only support single-stage schedules." ) + # torchtitan returns tuple of stages and models as depending on the schedule + # we might have multiple stages and model parts per rank. + # So far we don't support multi-stage schedules, which is why instead of tuples + # we work directly with the stage and model. stage, model = PipelineFactory._get_split_model( whole_model=whole_model, schedule_class=schedule_class, @@ -49,7 +104,9 @@ def get_pipelined_model( device=device, fqns_per_stage=fqns_per_stage, ) - return whole_model # TODO return pipelined model + + pipeline = Pipeline(stage=stage, model=model) + return pipeline @staticmethod def _get_split_model( @@ -171,4 +228,23 @@ def _build_stage_from_modules( device=device, group=pp_mesh.get_group("pp"), ) - return stage, whole_model + return stage, stage_modules + + @staticmethod + def get_scheduled_pipeline( + loss_fn: Loss, pp_schedule_name: str, batch_size: int, microbatch_size: int, pp_degree: int, pipeline: Pipeline + ) -> Pipeline: + # TODO: Addd validation in config that batch_size is divisible by microbatch_size + n_microbatches = batch_size // microbatch_size + num_total_stages = pp_degree + schedule_class = get_schedule_class(pp_schedule_name) + schedule = schedule_class( + stage=pipeline.stage, + n_microbatches=n_microbatches, + loss_fn=loss_fn, + ) + logger.info( + f"Using pipeline schedule {schedule} with {n_microbatches} microbatches and {num_total_stages} stages." + ) + pipeline.schedule = schedule + return pipeline From 9677bd6f09cb5372220c7cdac305f8f99769375a Mon Sep 17 00:00:00 2001 From: Max Luebbering <2804731+le1nux@users.noreply.github.com> Date: Mon, 18 Aug 2025 23:47:42 +0200 Subject: [PATCH 08/67] feat: wired up scheduled and staged pipelines. 
--- src/modalities/config/pydantic_if_types.py | 3 ++- .../pipeline_parallelism_configs.py | 18 +++++++++++++++++- src/modalities/registry/components.py | 12 +++++++++--- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/src/modalities/config/pydantic_if_types.py b/src/modalities/config/pydantic_if_types.py index eb7d0bce1..c91ad4549 100644 --- a/src/modalities/config/pydantic_if_types.py +++ b/src/modalities/config/pydantic_if_types.py @@ -21,7 +21,7 @@ from modalities.inference.text.inference_component import TextInferenceComponent from modalities.logging_broker.subscriber import MessageSubscriberIF from modalities.loss_functions import Loss -from modalities.models.parallelism.pipeline_parallelism import StagesGenerator +from modalities.models.parallelism.pipeline_parallelism import Pipeline, StagesGenerator from modalities.nn.model_initialization.initialization_if import ModelInitializationIF from modalities.tokenization.tokenizer_wrapper import TokenizerWrapper from modalities.training.gradient_clipping.gradient_clipper import GradientClipperIF @@ -85,3 +85,4 @@ def __get_pydantic_core_schema__( DatasetBatchGeneratorIF, PydanticThirdPartyTypeIF(DatasetBatchGeneratorIF) ] PydanticStagesGeneratorType = Annotated[StagesGenerator, PydanticThirdPartyTypeIF(StagesGenerator)] +PydanticPipelineType = Annotated[Pipeline, PydanticThirdPartyTypeIF(Pipeline)] diff --git a/src/modalities/models/parallelism/pipeline_parallelism_configs.py b/src/modalities/models/parallelism/pipeline_parallelism_configs.py index 61b8b5ba4..e86cc46be 100644 --- a/src/modalities/models/parallelism/pipeline_parallelism_configs.py +++ b/src/modalities/models/parallelism/pipeline_parallelism_configs.py @@ -4,19 +4,35 @@ from modalities.config.pydantic_if_types import ( PydanticDeviceMeshIFType, + PydanticPipelineType, PydanticPytorchModuleType, PydanticStagesGeneratorType, ) +from modalities.models.parallelism.pipeline_parallelism import PipelineSelectionTypes class 
FQNsPerStageGeneratorConfig(BaseModel): pass -class PipelinedModelConfig(BaseModel): +class StagedPipelineConfig(BaseModel): whole_model: PydanticPytorchModuleType stages_generator: PydanticStagesGeneratorType device_mesh: PydanticDeviceMeshIFType local_rank: Annotated[int, Field(strict=True, ge=0)] pp_schedule_name: str num_layers_per_stage: Annotated[int, Field(strict=True, ge=1)] + + +class ScheduledPipelineConfig(BaseModel): + loss_fn: PydanticPytorchModuleType + pp_schedule_name: str + batch_size: Annotated[int, Field(strict=True, ge=1)] + microbatch_size: Annotated[int, Field(strict=True, ge=1)] + pp_degree: Annotated[int, Field(strict=True, ge=2)] + pipeline: PydanticPipelineType + + +class ComponentSelectorFromPipelineConfig(BaseModel): + pipeline: PydanticPipelineType + selection_type: PipelineSelectionTypes diff --git a/src/modalities/registry/components.py b/src/modalities/registry/components.py index e6da12819..44d9820c4 100644 --- a/src/modalities/registry/components.py +++ b/src/modalities/registry/components.py @@ -86,8 +86,12 @@ from modalities.models.gpt2.gpt2_model import GPT2LLMConfig from modalities.models.huggingface.huggingface_model import HuggingFacePretrainedModel, HuggingFacePretrainedModelConfig from modalities.models.model_factory import GPT2ModelFactory, ModelFactory -from modalities.models.parallelism.pipeline_parallelism import PipelineFactory -from modalities.models.parallelism.pipeline_parallelism_configs import PipelinedModelConfig +from modalities.models.parallelism.pipeline_parallelism import ComponentSelectorFromPipeline, PipelineFactory +from modalities.models.parallelism.pipeline_parallelism_configs import ( + ComponentSelectorFromPipelineConfig, + ScheduledPipelineConfig, + StagedPipelineConfig, +) from modalities.models.parallelism.stages_generator import GPT2LLMStagesGenerator from modalities.models.parallelism.stages_generator_configs import GPT2LLMStagesGeneratorConfig from 
modalities.nn.model_initialization.composed_initialization import ( @@ -178,7 +182,9 @@ class ComponentEntity: ComponentEntity( "model", "debugging_enriched", ModelFactory.get_debugging_enriched_model, DebuggingEnrichedModelConfig ), - ComponentEntity("model", "pipelined", PipelineFactory.get_pipelined_model, PipelinedModelConfig), + ComponentEntity("pipeline", "staged", PipelineFactory.get_staged_pipeline, StagedPipelineConfig), + ComponentEntity("pipeline", "scheduled", PipelineFactory.get_scheduled_pipeline, ScheduledPipelineConfig), + ComponentEntity("pipeline", "selector", ComponentSelectorFromPipeline.select, ComponentSelectorFromPipelineConfig), # Pipeline Stages Generators ComponentEntity("stages_generator", "gpt2_stages_generator", GPT2LLMStagesGenerator, GPT2LLMStagesGeneratorConfig), # Device mesh From 7ac9edfd2578c3ab6c63ea29aed9057dfa22628b Mon Sep 17 00:00:00 2001 From: Max Luebbering <2804731+le1nux@users.noreply.github.com> Date: Mon, 18 Aug 2025 23:48:30 +0200 Subject: [PATCH 09/67] feat: added PP test config --- .../config_lorem_ipsum_long_fsdp2_pp.yaml | 395 ++++++++++++++++++ 1 file changed, 395 insertions(+) create mode 100644 config_files/training/config_lorem_ipsum_long_fsdp2_pp.yaml diff --git a/config_files/training/config_lorem_ipsum_long_fsdp2_pp.yaml b/config_files/training/config_lorem_ipsum_long_fsdp2_pp.yaml new file mode 100644 index 000000000..e5a3b61ce --- /dev/null +++ b/config_files/training/config_lorem_ipsum_long_fsdp2_pp.yaml @@ -0,0 +1,395 @@ +settings: + experiment_id: ${modalities_env:experiment_id} + config_file_path: ${modalities_env:config_file_path} + referencing_keys: + sample_key: input_ids + target_key: target_ids + prediction_key: logits + cuda_env: + local_rank: ${cuda_env:LOCAL_RANK} + global_rank: ${cuda_env:RANK} + world_size: ${cuda_env:WORLD_SIZE} + paths: + checkpoint_saving_path: data/checkpoints + train_dataset_path: ./data/lorem_ipsum_long.pbin + test_dataset_path: ./data/lorem_ipsum.pbin + intervals: + 
training_log_interval_in_steps: 1 + checkpointing_interval_in_steps: 32 + evaluation_interval_in_steps: 32 + consistency_enforcement: + enforce_tokens_per_step_consistency: true + enforce_last_step_logged: false + enforce_last_step_evaluated: false + enforce_last_step_checkpointed: false + step_profile: + gradient_accumulation_steps: 1 + local_train_micro_batch_size: 1 + sequence_length: 256 + training_target: + num_target_tokens: + component_key: number_conversion + variant_key: num_tokens_from_packed_mem_map_dataset_continuous + config: + dataset_path: ${settings.paths.train_dataset_path} + sequence_length: ${settings.step_profile.sequence_length} + num_ranks: ${settings.cuda_env.world_size} + local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size} + gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps} + num_target_steps: # for the batch progress subscriber + component_key: number_conversion + variant_key: num_steps_from_num_tokens + config: + num_ranks: ${settings.cuda_env.world_size} + local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size} + global_num_tokens: ${settings.training_target.num_target_tokens} + sequence_length: ${settings.step_profile.sequence_length} + gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps} + training_progress: + global_num_seen_tokens: 0 + num_seen_steps: 0 + num_seen_samples: 0 + last_step: -1 + +collate_fn: + component_key: collate_fn + variant_key: gpt_2_llm_collator + config: + sample_key: ${settings.referencing_keys.sample_key} + target_key: ${settings.referencing_keys.target_key} + +train_dataset: + component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: ${settings.paths.train_dataset_path} + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + +train_dataloader: + component_key: data_loader + variant_key: default + config: 
+ num_workers: 2 + pin_memory: true + dataloader_tag: train + dataset: + instance_key: train_dataset + pass_type: BY_REFERENCE + batch_sampler: + component_key: batch_sampler + variant_key: default + config: + batch_size: ${settings.step_profile.local_train_micro_batch_size} + drop_last: true + sampler: + component_key: sampler + variant_key: resumable_distributed_sampler + config: + dataset: + instance_key: train_dataset + pass_type: BY_REFERENCE + rank: ${settings.cuda_env.global_rank} + num_replicas: ${settings.cuda_env.world_size} + shuffle: true + seed: 42 + drop_last: true + skip_num_global_samples: ${settings.training_progress.num_seen_samples} + collate_fn: + instance_key: collate_fn + pass_type: BY_REFERENCE + +test_dataset: + component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: ${settings.paths.test_dataset_path} + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + +test_dataloader: + component_key: data_loader + variant_key: default + config: + num_workers: 2 + pin_memory: true + dataloader_tag: test + dataset: + instance_key: test_dataset + pass_type: BY_REFERENCE + batch_sampler: + component_key: batch_sampler + variant_key: default + config: + batch_size: ${settings.step_profile.local_train_micro_batch_size} + drop_last: true + sampler: + component_key: sampler + variant_key: distributed_sampler + config: + rank: ${settings.cuda_env.global_rank} + num_replicas: ${settings.cuda_env.world_size} + shuffle: false + drop_last: true + dataset: + instance_key: test_dataset + pass_type: BY_REFERENCE + collate_fn: + instance_key: collate_fn + pass_type: BY_REFERENCE + +eval_dataloaders: + - instance_key: test_dataloader + pass_type: BY_REFERENCE + +checkpoint_saving: + component_key: checkpoint_saving + variant_key: default + config: + checkpoint_saving_strategy: + component_key: checkpoint_saving_strategy + variant_key: 
save_k_most_recent_checkpoints_strategy + config: + k: -1 # -1 to save all checkpoints + checkpoint_saving_execution: + component_key: checkpoint_saving_execution + variant_key: dcp + config: + checkpoint_path: ${settings.paths.checkpoint_saving_path} + global_rank: ${settings.cuda_env.global_rank} + experiment_id: ${settings.experiment_id} + +loss_fn: + component_key: loss + variant_key: clm_cross_entropy_loss + config: + target_key: ${settings.referencing_keys.target_key} + prediction_key: ${settings.referencing_keys.prediction_key} + +device_mesh: + component_key: device_mesh + variant_key: default + config: + device_type: cuda + data_parallel_replicate_degree: 1 + pipeline_parallel_degree: 2 + data_parallel_shard_degree: -1 + world_size: ${settings.cuda_env.world_size} + +app_state: + component_key: app_state + variant_key: raw + config: + model: + instance_key: initialized_model + pass_type: BY_REFERENCE + optimizer: + instance_key: optimizer + pass_type: BY_REFERENCE + lr_scheduler: + instance_key: lr_scheduler + pass_type: BY_REFERENCE + +initialized_model: + component_key: model + variant_key: model_initialized + config: + model: + instance_key: fsdp_model + pass_type: BY_REFERENCE + model_initializer: + component_key: model_initialization + variant_key: composed + config: + model_type: gpt2 + weight_init_type: scaled + mean: 0.0 + std: 0.02 + num_layers: ${model_raw.config.n_layer} + +scheduled_pipeline: + component_key: pipeline + variant_key: scheduled + config: + loss_fn: + instance_key: loss_fn + pass_type: BY_REFERENCE + pp_schedule_name: gpipe + batch_size: ${settings.step_profile.local_train_micro_batch_size} + microbatch_size: 1 + pp_degree: ${device_mesh.config.pipeline_parallel_degree} + # maybe better to use the fsdp model and the schedule here + # instead of passing in the staged pipeline? + # If fsdp_model creates a copy then this is not in the scope of + # the staged pipeline. 
+ pipeline: + instance_key: staged_pipeline + pass_type: BY_REFERENCE + + + +fsdp_model: + component_key: model + variant_key: fsdp2_wrapped + config: + model: + instance_key: model_part + pass_type: BY_REFERENCE + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + mixed_precision_settings: + param_dtype: BF_16 + reduce_dtype: BF_16 + block_names: [GPT2Block] + +model_part: + component_key: pipeline + variant_key: selector + config: + pipeline: + instance_key: staged_pipeline + pass_type: BY_REFERENCE + selection_type: MODEL + + +staged_pipeline: + component_key: model + variant_key: staged + config: + whole_model: + instance_key: model_raw + pass_type: BY_REFERENCE + stages_generator: + component_key: stages_generator + variant_key: gpt2_stages_generator + config: + num_model_layers: ${model_raw.config.n_layer} + input_layer_equivalence: 1 + output_layer_equivalence: 1 + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + local_rank: ${settings.cuda_env.local_rank} + pp_schedule_name: gpipe + num_layers_per_stage: 2 + +model_raw: + component_key: model + variant_key: gpt2 + config: + use_meta_device: true + use_weight_tying: false + sample_key: ${settings.referencing_keys.sample_key} + poe_type: NOPE + sequence_length: ${settings.step_profile.sequence_length} + prediction_key: ${loss_fn.config.prediction_key} + vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency + n_layer: 2 + n_head_q: 8 + n_head_kv: 4 + ffn_hidden: 128 + n_embd: 128 + dropout: 0.0 + bias: true # True: bias in Linears and LayerNorms, like GPT-2. 
False: a bit better and faster + attention_config: + qkv_transforms: + - type_hint: RotaryTransform + config: + n_embd: ${model_raw.config.n_embd} + n_head: ${model_raw.config.n_head_q} #it has to be head_q here + seq_length_dim: -2 + base_freq: 10000 + attention_implementation: manual + activation_type: swiglu + attention_norm_config: + norm_type: layer_norm + config: + normalized_shape: ${model_raw.config.n_embd} + eps: 1e-5 + ffn_norm_config: + norm_type: layer_norm + config: + normalized_shape: ${model_raw.config.n_embd} + eps: 1e-5 + lm_head_norm_config: + norm_type: layer_norm + config: + normalized_shape: ${model_raw.config.n_embd} + eps: 1e-5 + +lr_scheduler: + component_key: scheduler + variant_key: onecycle_lr + config: + optimizer: + instance_key: optimizer + pass_type: BY_REFERENCE + max_lr: 6e-4 + div_factor: 10 + final_div_factor: 1 + total_steps: ${settings.training_target.num_target_steps} + pct_start: 0.01 + anneal_strategy: cos + last_epoch: ${settings.training_progress.last_step} + +optimizer: + component_key: optimizer + variant_key: adam_w + config: + lr: 0.0001 + betas: [0.9, 0.95] + eps: 1e-8 + weight_decay: 1e-1 + weight_decay_groups_excluded: [embedding, layernorm] + wrapped_model: + instance_key: initialized_model + pass_type: BY_REFERENCE + +gradient_clipper: + component_key: gradient_clipper + variant_key: fsdp2 + config: + wrapped_model: + instance_key: initialized_model + pass_type: BY_REFERENCE + norm_type: P2_NORM + max_norm: 1.0 + +progress_subscriber: + component_key: progress_subscriber + variant_key: rich + config: + global_rank: ${settings.cuda_env.global_rank} + num_seen_steps: ${settings.training_progress.num_seen_steps} + num_target_steps: ${settings.training_target.num_target_steps} + train_dataloader_tag: ${train_dataloader.config.dataloader_tag} + eval_dataloaders: + instance_key: eval_dataloaders + pass_type: BY_REFERENCE + +evaluation_subscriber: + component_key: results_subscriber + variant_key: wandb + config: + 
global_rank: ${settings.cuda_env.global_rank} + project: modalities_dcp_tests + mode: OFFLINE + experiment_id: ${settings.experiment_id} + directory: wandb_storage + config_file_path: ${settings.config_file_path} + +# mfu_calculator: +# component_key: mfu_calculator +# variant_key: gpt2 +# config: +# n_layer: ${model_raw.config.n_layer} +# sequence_length: ${settings.step_profile.sequence_length} +# n_embd: ${model_raw.config.n_embd} +# world_size: ${settings.cuda_env.world_size} +# raw_model: +# instance_key: model_raw +# pass_type: BY_REFERENCE +# wrapped_model: +# instance_key: initialized_model +# pass_type: BY_REFERENCE \ No newline at end of file From d9f63c11d4f0f53a823fc60dc6f016169a185100 Mon Sep 17 00:00:00 2001 From: Max Luebbering <2804731+le1nux@users.noreply.github.com> Date: Tue, 19 Aug 2025 14:39:09 +0200 Subject: [PATCH 10/67] refactor: staging is now fully instantiable --- .../config_lorem_ipsum_long_fsdp2_pp.yaml | 34 ++++++++++++++----- src/modalities/config/pydantic_if_types.py | 2 ++ .../parallelism/pipeline_parallelism.py | 27 ++++++++++++--- .../pipeline_parallelism_configs.py | 12 +++++-- .../parallelism/stages_generator_configs.py | 2 +- src/modalities/registry/components.py | 2 ++ 6 files changed, 63 insertions(+), 16 deletions(-) diff --git a/config_files/training/config_lorem_ipsum_long_fsdp2_pp.yaml b/config_files/training/config_lorem_ipsum_long_fsdp2_pp.yaml index e5a3b61ce..fa2343b93 100644 --- a/config_files/training/config_lorem_ipsum_long_fsdp2_pp.yaml +++ b/config_files/training/config_lorem_ipsum_long_fsdp2_pp.yaml @@ -24,7 +24,7 @@ settings: enforce_last_step_checkpointed: false step_profile: gradient_accumulation_steps: 1 - local_train_micro_batch_size: 1 + local_train_micro_batch_size: 2 sequence_length: 256 training_target: num_target_tokens: @@ -190,13 +190,19 @@ app_state: instance_key: lr_scheduler pass_type: BY_REFERENCE + initialized_model: component_key: model variant_key: model_initialized config: model: - 
instance_key: fsdp_model - pass_type: BY_REFERENCE + component_key: pipeline + variant_key: selector + config: + pipeline: + instance_key: scheduled_pipeline + pass_type: BY_REFERENCE + selection_type: MODEL model_initializer: component_key: model_initialization variant_key: composed @@ -223,11 +229,21 @@ scheduled_pipeline: # If fsdp_model creates a copy then this is not in the scope of # the staged pipeline. pipeline: - instance_key: staged_pipeline - pass_type: BY_REFERENCE - - - + component_key: pipeline + variant_key: builder + config: + stage: + component_key: pipeline + variant_key: selector + config: + pipeline: + instance_key: staged_pipeline + pass_type: BY_REFERENCE + selection_type: STAGE + model: + instance_key: fsdp_model + pass_type: BY_REFERENCE + fsdp_model: component_key: model variant_key: fsdp2_wrapped @@ -254,7 +270,7 @@ model_part: staged_pipeline: - component_key: model + component_key: pipeline variant_key: staged config: whole_model: diff --git a/src/modalities/config/pydantic_if_types.py b/src/modalities/config/pydantic_if_types.py index c91ad4549..2aeceb53c 100644 --- a/src/modalities/config/pydantic_if_types.py +++ b/src/modalities/config/pydantic_if_types.py @@ -7,6 +7,7 @@ from torch.distributed.device_mesh import DeviceMesh from torch.distributed.fsdp import FSDPModule as FSDP2 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP1 +from torch.distributed.pipelining import PipelineStage from torch.optim import Optimizer from torch.optim.lr_scheduler import LRScheduler from torch.utils.data import Sampler @@ -86,3 +87,4 @@ def __get_pydantic_core_schema__( ] PydanticStagesGeneratorType = Annotated[StagesGenerator, PydanticThirdPartyTypeIF(StagesGenerator)] PydanticPipelineType = Annotated[Pipeline, PydanticThirdPartyTypeIF(Pipeline)] +PydanticPipelineStageType = Annotated[PipelineStage, PydanticThirdPartyTypeIF(PipelineStage)] diff --git a/src/modalities/models/parallelism/pipeline_parallelism.py 
b/src/modalities/models/parallelism/pipeline_parallelism.py index e9ac0c755..b842fd75c 100644 --- a/src/modalities/models/parallelism/pipeline_parallelism.py +++ b/src/modalities/models/parallelism/pipeline_parallelism.py @@ -39,7 +39,19 @@ def is_first_stage(self) -> bool: def is_last_stage(self) -> bool: return self._stage.is_last - @property.setter + @property + def stage(self) -> PipelineStage: + return self._stage + + @property + def model(self) -> nn.Module: + return self._model + + @property + def schedule(self) -> Optional[PipelineScheduleSingle]: + return self._schedule + + @schedule.setter def schedule(self, schedule: PipelineScheduleSingle): self._schedule = schedule @@ -47,9 +59,9 @@ def schedule(self, schedule: PipelineScheduleSingle): class PipelineSelectionTypes(Enum): """Enum for pipeline selection types.""" - STAGE = "stage" - MODEL = "model" - SCHEDULE = "schedule" + STAGE = "STAGE" + MODEL = "MODEL" + SCHEDULE = "SCHEDULE" class ComponentSelectorFromPipeline: @@ -69,6 +81,12 @@ def select(pipeline: Pipeline, selection_type: PipelineSelectionTypes) -> Any: class PipelineFactory: """Pipeline factory class to create pipelined models.""" + @staticmethod + def get_pipeline( + stage: PipelineStage, model: nn.Module, schedule: Optional[PipelineScheduleSingle] = None + ) -> Pipeline: + return Pipeline(stage=stage, model=model, schedule=schedule) + @staticmethod def get_staged_pipeline( whole_model: nn.Module, @@ -235,6 +253,7 @@ def get_scheduled_pipeline( loss_fn: Loss, pp_schedule_name: str, batch_size: int, microbatch_size: int, pp_degree: int, pipeline: Pipeline ) -> Pipeline: # TODO: Addd validation in config that batch_size is divisible by microbatch_size + # and n_microbatches must be >= pp_degree n_microbatches = batch_size // microbatch_size num_total_stages = pp_degree schedule_class = get_schedule_class(pp_schedule_name) diff --git a/src/modalities/models/parallelism/pipeline_parallelism_configs.py 
b/src/modalities/models/parallelism/pipeline_parallelism_configs.py index e86cc46be..c1aa23d48 100644 --- a/src/modalities/models/parallelism/pipeline_parallelism_configs.py +++ b/src/modalities/models/parallelism/pipeline_parallelism_configs.py @@ -4,6 +4,8 @@ from modalities.config.pydantic_if_types import ( PydanticDeviceMeshIFType, + PydanticLossIFType, + PydanticPipelineStageType, PydanticPipelineType, PydanticPytorchModuleType, PydanticStagesGeneratorType, @@ -11,7 +13,7 @@ from modalities.models.parallelism.pipeline_parallelism import PipelineSelectionTypes -class FQNsPerStageGeneratorConfig(BaseModel): +class FQNsPerStageGeneratorConfig(BaseModel): # TODO duplicate pass @@ -25,7 +27,7 @@ class StagedPipelineConfig(BaseModel): class ScheduledPipelineConfig(BaseModel): - loss_fn: PydanticPytorchModuleType + loss_fn: PydanticLossIFType pp_schedule_name: str batch_size: Annotated[int, Field(strict=True, ge=1)] microbatch_size: Annotated[int, Field(strict=True, ge=1)] @@ -36,3 +38,9 @@ class ScheduledPipelineConfig(BaseModel): class ComponentSelectorFromPipelineConfig(BaseModel): pipeline: PydanticPipelineType selection_type: PipelineSelectionTypes + + +class PipelineConfig(BaseModel): + stage: PydanticPipelineStageType + model: PydanticPytorchModuleType + schedule: PydanticPipelineType | None = None diff --git a/src/modalities/models/parallelism/stages_generator_configs.py b/src/modalities/models/parallelism/stages_generator_configs.py index 610be7fdd..5d53f091d 100644 --- a/src/modalities/models/parallelism/stages_generator_configs.py +++ b/src/modalities/models/parallelism/stages_generator_configs.py @@ -3,7 +3,7 @@ from pydantic import BaseModel, Field -class FQNsPerStageGeneratorConfig(BaseModel): +class FQNsPerStageGeneratorConfig(BaseModel): # TODO duplicate pass diff --git a/src/modalities/registry/components.py b/src/modalities/registry/components.py index 44d9820c4..167a29894 100644 --- a/src/modalities/registry/components.py +++ 
b/src/modalities/registry/components.py @@ -89,6 +89,7 @@ from modalities.models.parallelism.pipeline_parallelism import ComponentSelectorFromPipeline, PipelineFactory from modalities.models.parallelism.pipeline_parallelism_configs import ( ComponentSelectorFromPipelineConfig, + PipelineConfig, ScheduledPipelineConfig, StagedPipelineConfig, ) @@ -185,6 +186,7 @@ class ComponentEntity: ComponentEntity("pipeline", "staged", PipelineFactory.get_staged_pipeline, StagedPipelineConfig), ComponentEntity("pipeline", "scheduled", PipelineFactory.get_scheduled_pipeline, ScheduledPipelineConfig), ComponentEntity("pipeline", "selector", ComponentSelectorFromPipeline.select, ComponentSelectorFromPipelineConfig), + ComponentEntity("pipeline", "builder", PipelineFactory.get_pipeline, PipelineConfig), # Pipeline Stages Generators ComponentEntity("stages_generator", "gpt2_stages_generator", GPT2LLMStagesGenerator, GPT2LLMStagesGeneratorConfig), # Device mesh From 83c87b9d6d6fbbb228bab31dccf1870b12679775 Mon Sep 17 00:00:00 2001 From: Max Luebbering <2804731+le1nux@users.noreply.github.com> Date: Tue, 19 Aug 2025 14:39:58 +0200 Subject: [PATCH 11/67] feat: drafted pp e2e test for fwd/bwd pass --- .../pipeline_parallelism/__init__.py | 0 ...orem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml | 171 ++++++++++++++++++ .../test_pp_fwd_bwd_pass.py | 104 +++++++++++ 3 files changed, 275 insertions(+) create mode 100644 tests/fsdp2_parallelization/pipeline_parallelism/__init__.py create mode 100644 tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml create mode 100644 tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py diff --git a/tests/fsdp2_parallelization/pipeline_parallelism/__init__.py b/tests/fsdp2_parallelization/pipeline_parallelism/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml 
b/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml new file mode 100644 index 000000000..88182d266 --- /dev/null +++ b/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml @@ -0,0 +1,171 @@ +settings: + experiment_id: ${modalities_env:experiment_id} + config_file_path: ${modalities_env:config_file_path} + referencing_keys: + sample_key: input_ids + target_key: target_ids + prediction_key: logits + cuda_env: + local_rank: ${cuda_env:LOCAL_RANK} + global_rank: ${cuda_env:RANK} + world_size: ${cuda_env:WORLD_SIZE} + step_profile: + gradient_accumulation_steps: 1 + local_train_micro_batch_size: 2 + sequence_length: 256 + +loss_fn: + component_key: loss + variant_key: clm_cross_entropy_loss + config: + target_key: ${settings.referencing_keys.target_key} + prediction_key: ${settings.referencing_keys.prediction_key} + +device_mesh: + component_key: device_mesh + variant_key: default + config: + device_type: cuda + data_parallel_replicate_degree: 1 + pipeline_parallel_degree: 2 + data_parallel_shard_degree: -1 + world_size: ${settings.cuda_env.world_size} + +initialized_model: + component_key: model + variant_key: model_initialized + config: + model: + component_key: pipeline + variant_key: selector + config: + pipeline: + instance_key: scheduled_pipeline + pass_type: BY_REFERENCE + selection_type: MODEL + model_initializer: + component_key: model_initialization + variant_key: composed + config: + model_type: gpt2 + weight_init_type: scaled + mean: 0.0 + std: 0.02 + num_layers: ${model_raw.config.n_layer} + +scheduled_pipeline: + component_key: pipeline + variant_key: scheduled + config: + loss_fn: + instance_key: loss_fn + pass_type: BY_REFERENCE + pp_schedule_name: gpipe + batch_size: ${settings.step_profile.local_train_micro_batch_size} + microbatch_size: 1 + pp_degree: ${device_mesh.config.pipeline_parallel_degree} + pipeline: + component_key: pipeline + 
variant_key: builder + config: + stage: + component_key: pipeline + variant_key: selector + config: + pipeline: + instance_key: staged_pipeline + pass_type: BY_REFERENCE + selection_type: STAGE + model: + instance_key: fsdp_model + pass_type: BY_REFERENCE + +fsdp_model: + component_key: model + variant_key: fsdp2_wrapped + config: + model: + instance_key: model_part + pass_type: BY_REFERENCE + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + mixed_precision_settings: + param_dtype: BF_16 + reduce_dtype: BF_16 + block_names: [GPT2Block] + +model_part: + component_key: pipeline + variant_key: selector + config: + pipeline: + instance_key: staged_pipeline + pass_type: BY_REFERENCE + selection_type: MODEL + +staged_pipeline: + component_key: pipeline + variant_key: staged + config: + whole_model: + instance_key: model_raw + pass_type: BY_REFERENCE + stages_generator: + component_key: stages_generator + variant_key: gpt2_stages_generator + config: + num_model_layers: ${model_raw.config.n_layer} + input_layer_equivalence: 1 + output_layer_equivalence: 1 + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + local_rank: ${settings.cuda_env.local_rank} + pp_schedule_name: gpipe + num_layers_per_stage: 2 + +model_raw: + component_key: model + variant_key: gpt2 + config: + use_meta_device: true + use_weight_tying: false + sample_key: ${settings.referencing_keys.sample_key} + poe_type: NOPE + sequence_length: ${settings.step_profile.sequence_length} + prediction_key: ${loss_fn.config.prediction_key} + vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency + n_layer: 2 + n_head_q: 8 + n_head_kv: 4 + ffn_hidden: 128 + n_embd: 128 + dropout: 0.0 + bias: true # True: bias in Linears and LayerNorms, like GPT-2. 
False: a bit better and faster + attention_config: + qkv_transforms: + - type_hint: RotaryTransform + config: + n_embd: ${model_raw.config.n_embd} + n_head: ${model_raw.config.n_head_q} #it has to be head_q here + seq_length_dim: -2 + base_freq: 10000 + attention_implementation: manual + activation_type: swiglu + attention_norm_config: + norm_type: layer_norm + config: + normalized_shape: ${model_raw.config.n_embd} + eps: 1e-5 + ffn_norm_config: + norm_type: layer_norm + config: + normalized_shape: ${model_raw.config.n_embd} + eps: 1e-5 + lm_head_norm_config: + norm_type: layer_norm + config: + normalized_shape: ${model_raw.config.n_embd} + eps: 1e-5 + diff --git a/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py b/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py new file mode 100644 index 000000000..fc24223e9 --- /dev/null +++ b/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py @@ -0,0 +1,104 @@ +import os +import tempfile +from pathlib import Path + +import pytest +import torch +import torch.multiprocessing as mp +import yaml +from pydantic import BaseModel + +from modalities.__main__ import Main +from modalities.config.config import ProcessGroupBackendType +from modalities.config.pydantic_if_types import PydanticFSDP2ModuleType, PydanticPipelineType +from tests.end2end_tests.custom_components import MultiProcessingCudaEnv + + +@pytest.fixture +def temp_file_path() -> Path: + # Create a NamedTemporaryFile that persists after closing (delete=False) + with tempfile.NamedTemporaryFile(delete=False) as tf: + file_path = tf.name + try: + yield Path(file_path) + finally: + # Clean up the file after the test + if os.path.exists(file_path): + os.remove(file_path) + + +class ComponentsInstantiationModel(BaseModel): + initialized_model: PydanticFSDP2ModuleType + scheduled_pipeline: PydanticPipelineType + + +@pytest.mark.skipif( + torch.cuda.device_count() < 8, + reason="This test requires 8 GPUs", +) 
+class TestPipelineParallelism: + def _get_tmp_sharding_config_path( + self, sharding_degree: int, tp_degree: int, pp_degree: int, temp_file_path: Path + ) -> Path: + working_dir = Path(os.path.dirname(__file__)) + config_file_path = working_dir / "configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml" + + with open(config_file_path, "r") as file: + config_string = file.read() + config_dict = yaml.safe_load(config_string) + config_dict["device_mesh"]["config"]["data_parallel_shard_degree"] = sharding_degree + config_dict["device_mesh"]["config"]["tensor_parallel_degree"] = tp_degree + config_dict["device_mesh"]["config"]["pipeline_parallel_degree"] = pp_degree + + # save to temporary file + with open(temp_file_path, "w") as file: + yaml.dump(config_dict, file) + + return temp_file_path + + def _get_components(self, config_file_path: Path) -> ComponentsInstantiationModel: + main_obj = Main(config_file_path) + components: ComponentsInstantiationModel = main_obj.build_components( + components_model_type=ComponentsInstantiationModel + ) + return components + + @pytest.mark.parametrize( + "sharding_degree, tp_degree, pp_degree, world_size", + [ + (2, 1, 2, 4), + # (2, 1, 4, 8), + # (2, 2, 2, 8), # TODO need to support this case + ], + ) + def test_pp(self, sharding_degree: int, tp_degree: int, pp_degree: int, world_size: int, temp_file_path: Path): + tmp_sharding_config_path = self._get_tmp_sharding_config_path( + sharding_degree=sharding_degree, + tp_degree=tp_degree, + pp_degree=pp_degree, + temp_file_path=temp_file_path, + ) + mp.spawn( + self._test_pp_impl, + args=(world_size, sharding_degree, tmp_sharding_config_path), + nprocs=world_size, + join=True, + ) + + def _test_pp_impl( + self, + process_id: int, + world_size: int, + sharding_degree: int, + gpt2_model_config_path: Path, + ): + # wraps the actual test function to be able to run it in a distributed multiprocessing setup + with MultiProcessingCudaEnv( + process_group_backend=ProcessGroupBackendType.nccl, 
+ global_rank=process_id, + local_rank=process_id, + world_size=world_size, + rdvz_port=22356, + ): + self._get_components(gpt2_model_config_path) + pass From 95f24701fc9940e565893668e6d07cd6dc93b3ca Mon Sep 17 00:00:00 2001 From: Max Luebbering <2804731+le1nux@users.noreply.github.com> Date: Fri, 29 Aug 2025 09:55:12 +0200 Subject: [PATCH 12/67] refactor: renamings in the context of PP --- .../parallelism/pipeline_parallelism.py | 74 +++++++++---------- .../pipeline_parallelism_configs.py | 6 +- 2 files changed, 40 insertions(+), 40 deletions(-) diff --git a/src/modalities/models/parallelism/pipeline_parallelism.py b/src/modalities/models/parallelism/pipeline_parallelism.py index b842fd75c..006d97a55 100644 --- a/src/modalities/models/parallelism/pipeline_parallelism.py +++ b/src/modalities/models/parallelism/pipeline_parallelism.py @@ -23,57 +23,57 @@ class Pipeline: def __init__( self, - stage: PipelineStage, - model: nn.Module, - schedule: Optional[PipelineScheduleSingle] = None, + pp_stage: PipelineStage, + model_part: nn.Module, + pp_schedule: Optional[PipelineScheduleSingle] = None, ): - self._stage = stage - self._model = model - self._schedule = schedule + self._pp_stage = pp_stage + self._model_part = model_part + self._pp_schedule = pp_schedule @property - def is_first_stage(self) -> bool: - return self._stage.is_first + def is_first_pp_stage(self) -> bool: + return self._pp_stage.is_first @property - def is_last_stage(self) -> bool: - return self._stage.is_last + def is_last_pp_stage(self) -> bool: + return self._pp_stage.is_last @property - def stage(self) -> PipelineStage: - return self._stage + def pp_stage(self) -> PipelineStage: + return self._pp_stage @property - def model(self) -> nn.Module: - return self._model + def model_part(self) -> nn.Module: + return self._model_part @property - def schedule(self) -> Optional[PipelineScheduleSingle]: - return self._schedule + def pp_schedule(self) -> Optional[PipelineScheduleSingle]: + return 
self._pp_schedule - @schedule.setter - def schedule(self, schedule: PipelineScheduleSingle): - self._schedule = schedule + @pp_schedule.setter + def pp_schedule(self, schedule: PipelineScheduleSingle): + self._pp_schedule = schedule class PipelineSelectionTypes(Enum): """Enum for pipeline selection types.""" - STAGE = "STAGE" - MODEL = "MODEL" - SCHEDULE = "SCHEDULE" + PP_STAGE = "PP_STAGE" + MODEL_PART = "MODEL_PART" + PP_SCHEDULE = "PP_SCHEDULE" class ComponentSelectorFromPipeline: @staticmethod def select(pipeline: Pipeline, selection_type: PipelineSelectionTypes) -> Any: """Selects a component from the pipeline based on the selection type.""" - if selection_type == PipelineSelectionTypes.STAGE: - return pipeline._stage - elif selection_type == PipelineSelectionTypes.MODEL: - return pipeline._model - elif selection_type == PipelineSelectionTypes.SCHEDULE: - return pipeline._schedule + if selection_type == PipelineSelectionTypes.PP_STAGE: + return pipeline.pp_stage + elif selection_type == PipelineSelectionTypes.MODEL_PART: + return pipeline.model_part + elif selection_type == PipelineSelectionTypes.PP_SCHEDULE: + return pipeline.pp_schedule else: raise ValueError(f"Unsupported selection type: {selection_type}") @@ -83,9 +83,9 @@ class PipelineFactory: @staticmethod def get_pipeline( - stage: PipelineStage, model: nn.Module, schedule: Optional[PipelineScheduleSingle] = None + pp_stage: PipelineStage, model_part: nn.Module, pp_schedule: Optional[PipelineScheduleSingle] = None ) -> Pipeline: - return Pipeline(stage=stage, model=model, schedule=schedule) + return Pipeline(pp_stage=pp_stage, model_part=model_part, pp_schedule=pp_schedule) @staticmethod def get_staged_pipeline( @@ -115,7 +115,7 @@ def get_staged_pipeline( # we might have multiple stages and model parts per rank. # So far we don't support multi-stage schedules, which is why instead of tuples # we work directly with the stage and model. 
- stage, model = PipelineFactory._get_split_model( + pp_stage, model_part = PipelineFactory._get_split_model( whole_model=whole_model, schedule_class=schedule_class, pp_mesh=pp_mesh, @@ -123,7 +123,7 @@ def get_staged_pipeline( fqns_per_stage=fqns_per_stage, ) - pipeline = Pipeline(stage=stage, model=model) + pipeline = Pipeline(pp_stage=pp_stage, model_part=model_part) return pipeline @staticmethod @@ -256,14 +256,14 @@ def get_scheduled_pipeline( # and n_microbatches must be >= pp_degree n_microbatches = batch_size // microbatch_size num_total_stages = pp_degree - schedule_class = get_schedule_class(pp_schedule_name) - schedule = schedule_class( - stage=pipeline.stage, + pp_schedule_class = get_schedule_class(pp_schedule_name) + pp_schedule = pp_schedule_class( + stage=pipeline.pp_stage, n_microbatches=n_microbatches, loss_fn=loss_fn, ) logger.info( - f"Using pipeline schedule {schedule} with {n_microbatches} microbatches and {num_total_stages} stages." + f"Using pipeline schedule {pp_schedule} with {n_microbatches} microbatches and {num_total_stages} stages." 
) - pipeline.schedule = schedule + pipeline.pp_schedule = pp_schedule return pipeline diff --git a/src/modalities/models/parallelism/pipeline_parallelism_configs.py b/src/modalities/models/parallelism/pipeline_parallelism_configs.py index c1aa23d48..831a6e15e 100644 --- a/src/modalities/models/parallelism/pipeline_parallelism_configs.py +++ b/src/modalities/models/parallelism/pipeline_parallelism_configs.py @@ -41,6 +41,6 @@ class ComponentSelectorFromPipelineConfig(BaseModel): class PipelineConfig(BaseModel): - stage: PydanticPipelineStageType - model: PydanticPytorchModuleType - schedule: PydanticPipelineType | None = None + pp_stage: PydanticPipelineStageType + model_part: PydanticPytorchModuleType + pp_schedule: PydanticPipelineType | None = None From 521e5867559c984c71ab98b12d58a349c66d69cd Mon Sep 17 00:00:00 2001 From: Max Luebbering <2804731+le1nux@users.noreply.github.com> Date: Fri, 29 Aug 2025 09:56:39 +0200 Subject: [PATCH 13/67] chore: drafted the first PP test. --- ...orem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml | 18 +++++----- .../test_pp_fwd_bwd_pass.py | 34 ++++++++++++++++--- 2 files changed, 39 insertions(+), 13 deletions(-) diff --git a/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml b/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml index 88182d266..0ceb02a53 100644 --- a/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml +++ b/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml @@ -11,7 +11,7 @@ settings: world_size: ${cuda_env:WORLD_SIZE} step_profile: gradient_accumulation_steps: 1 - local_train_micro_batch_size: 2 + local_train_micro_batch_size: 4 sequence_length: 256 loss_fn: @@ -42,7 +42,7 @@ initialized_model: pipeline: instance_key: scheduled_pipeline pass_type: BY_REFERENCE - selection_type: MODEL + 
selection_type: MODEL_PART model_initializer: component_key: model_initialization variant_key: composed @@ -62,21 +62,21 @@ scheduled_pipeline: pass_type: BY_REFERENCE pp_schedule_name: gpipe batch_size: ${settings.step_profile.local_train_micro_batch_size} - microbatch_size: 1 + microbatch_size: 2 pp_degree: ${device_mesh.config.pipeline_parallel_degree} pipeline: component_key: pipeline variant_key: builder config: - stage: + pp_stage: component_key: pipeline variant_key: selector config: pipeline: instance_key: staged_pipeline pass_type: BY_REFERENCE - selection_type: STAGE - model: + selection_type: PP_STAGE + model_part: instance_key: fsdp_model pass_type: BY_REFERENCE @@ -102,7 +102,7 @@ model_part: pipeline: instance_key: staged_pipeline pass_type: BY_REFERENCE - selection_type: MODEL + selection_type: MODEL_PART staged_pipeline: component_key: pipeline @@ -123,7 +123,7 @@ staged_pipeline: pass_type: BY_REFERENCE local_rank: ${settings.cuda_env.local_rank} pp_schedule_name: gpipe - num_layers_per_stage: 2 + num_layers_per_stage: 4 model_raw: component_key: model @@ -136,7 +136,7 @@ model_raw: sequence_length: ${settings.step_profile.sequence_length} prediction_key: ${loss_fn.config.prediction_key} vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency - n_layer: 2 + n_layer: 6 n_head_q: 8 n_head_kv: 4 ffn_hidden: 128 diff --git a/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py b/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py index fc24223e9..6f861c1ea 100644 --- a/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py +++ b/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py @@ -11,6 +11,7 @@ from modalities.__main__ import Main from modalities.config.config import ProcessGroupBackendType from modalities.config.pydantic_if_types import PydanticFSDP2ModuleType, PydanticPipelineType +from 
modalities.models.parallelism.pipeline_parallelism import Pipeline from tests.end2end_tests.custom_components import MultiProcessingCudaEnv @@ -80,7 +81,7 @@ def test_pp(self, sharding_degree: int, tp_degree: int, pp_degree: int, world_si ) mp.spawn( self._test_pp_impl, - args=(world_size, sharding_degree, tmp_sharding_config_path), + args=(world_size, tmp_sharding_config_path), nprocs=world_size, join=True, ) @@ -89,7 +90,6 @@ def _test_pp_impl( self, process_id: int, world_size: int, - sharding_degree: int, gpt2_model_config_path: Path, ): # wraps the actual test function to be able to run it in a distributed multiprocessing setup @@ -100,5 +100,31 @@ def _test_pp_impl( world_size=world_size, rdvz_port=22356, ): - self._get_components(gpt2_model_config_path) - pass + components = self._get_components(gpt2_model_config_path) + scheduled_pipeline = components.scheduled_pipeline + vocab_size = 50304 + sequence_length = 256 + batch_size = 4 + sequences = torch.randint(0, vocab_size, (batch_size, sequence_length)) + targets = sequences[:, 1:].contiguous() + inputs = sequences[:, :-1].contiguous() + self._forward_step(scheduled_pipeline, inputs, targets) + + def _forward_step(self, scheduled_pipeline: Pipeline, inputs: torch.Tensor, targets: torch.Tensor): + """Runs a forward step on the model.""" + pp_schedule = scheduled_pipeline.pp_schedule + targets, losses = (targets, []) if scheduled_pipeline.is_last_pp_stage else (None, None) + if scheduled_pipeline.is_first_pp_stage: # first stage + pp_schedule.step(inputs, target=targets, losses=losses, input_batch=inputs) + else: # non-first stage + pp_schedule.step(target=targets, losses=losses, input_batch=inputs) + + # accumulate losses across pipeline microbatches + # TODO: PP+FSDP unexpectedly puts the loss back to the CPU + ( + torch.mean(torch.stack(losses)).to(self.device) + if self.pp_has_last_stage + else torch.tensor([-1.0], device=self.device) + ) + + # return output From 002b0ae557411351dc274be97f7e0e6c59c0afd8 
Mon Sep 17 00:00:00 2001 From: Max Luebbering <2804731+le1nux@users.noreply.github.com> Date: Sat, 30 Aug 2025 00:46:18 +0200 Subject: [PATCH 14/67] chore: pp config fixes --- .../training/config_lorem_ipsum_long_fsdp2_pp.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/config_files/training/config_lorem_ipsum_long_fsdp2_pp.yaml b/config_files/training/config_lorem_ipsum_long_fsdp2_pp.yaml index fa2343b93..381550a20 100644 --- a/config_files/training/config_lorem_ipsum_long_fsdp2_pp.yaml +++ b/config_files/training/config_lorem_ipsum_long_fsdp2_pp.yaml @@ -202,7 +202,7 @@ initialized_model: pipeline: instance_key: scheduled_pipeline pass_type: BY_REFERENCE - selection_type: MODEL + selection_type: MODEL_PART model_initializer: component_key: model_initialization variant_key: composed @@ -232,15 +232,15 @@ scheduled_pipeline: component_key: pipeline variant_key: builder config: - stage: + pp_stage: component_key: pipeline variant_key: selector config: pipeline: instance_key: staged_pipeline pass_type: BY_REFERENCE - selection_type: STAGE - model: + selection_type: PP_STAGE + model_part: instance_key: fsdp_model pass_type: BY_REFERENCE @@ -266,7 +266,7 @@ model_part: pipeline: instance_key: staged_pipeline pass_type: BY_REFERENCE - selection_type: MODEL + selection_type: MODEL_PART staged_pipeline: From 1d4943f5c065af94164059739dca15f0c2f72049 Mon Sep 17 00:00:00 2001 From: rrutmann Date: Fri, 5 Sep 2025 18:14:47 +0200 Subject: [PATCH 15/67] feat: Make test for pipeline parallelism work --- src/modalities/loss_functions.py | 8 ++ src/modalities/models/gpt2/gpt2_model.py | 55 +++++++++ src/modalities/models/model_factory.py | 7 +- src/modalities/registry/components.py | 3 +- ...g_lorem_ipsum_long_fsdp2_fwd_bwd_pass.yaml | 108 ++++++++++++++++++ ...orem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml | 3 +- .../test_pp_fwd_bwd_pass.py | 58 +++++++--- 7 files changed, 221 insertions(+), 21 deletions(-) create mode 100644 
tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_fwd_bwd_pass.yaml diff --git a/src/modalities/loss_functions.py b/src/modalities/loss_functions.py index 54d8de36b..f46fb0398 100644 --- a/src/modalities/loss_functions.py +++ b/src/modalities/loss_functions.py @@ -44,6 +44,14 @@ def __call__(self, forward_batch: InferenceResultBatch) -> torch.Tensor: return loss +class CLMCrossEntropyLossPP(CLMCrossEntropyLoss): + def __call__(self, outputs: torch.Tensor, targets: torch.Tensor) -> torch.Tensor: + forward_batch = InferenceResultBatch( + predictions={self.prediction_key: outputs}, targets={self.target_key: targets} + ) + return super().__call__(forward_batch) + + def nce_loss( embedding1: torch.Tensor, embedding2: torch.Tensor, device: torch.device, is_asymmetric: bool, temperature: float ) -> torch.Tensor: diff --git a/src/modalities/models/gpt2/gpt2_model.py b/src/modalities/models/gpt2/gpt2_model.py index a2022d716..76b0399ae 100644 --- a/src/modalities/models/gpt2/gpt2_model.py +++ b/src/modalities/models/gpt2/gpt2_model.py @@ -344,6 +344,7 @@ class GPT2LLMConfig(BaseModel): ffn_norm_config: LayerNormWrapperConfig lm_head_norm_config: LayerNormWrapperConfig use_weight_tying: bool + use_pp: Optional[bool] = False @model_validator(mode="after") def check_divisibility(self) -> "GPT2LLMConfig": @@ -930,6 +931,60 @@ def forward(self, inputs: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: return self.forward_impl(inputs) +class GPT2LLMPP(GPT2LLM): + """GPT2LLM class.""" + + def forward_impl(self, inputs: torch.Tensor) -> torch.Tensor: + """ + Forward pass implementation of the GPT2LLM module. + + Args: + inputs (dict[str, torch.Tensor]): A dictionary containing input tensors. + - sample_key (str): Key for the input tensor containing token ids. + + Returns: + dict[str, torch.Tensor]: A dictionary containing output tensors. + - prediction_key (str): Key for the output tensor containing logits. 
+ """ + device = inputs.device + t = inputs.size(1) # batch size, sequence length + assert t <= self.sequence_length, f"Cannot forward sequence of length {t}, the model's maximum " + f"input sequence length is only {self.sequence_length}" + + # forward the GPT model itself + h = ( + self.transformer.wte(inputs) if hasattr(self.transformer, "wte") else inputs + ) # token embeddings of shape (b, t, n_embd) + + if self.poe_type is PositionTypes.ABSOLUTE and hasattr(self.transformer, "wpe"): + pos = torch.arange(0, t, dtype=torch.long, device=device) # shape (t) + pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd) + h = h + pos_emb + + # TODO: use drop out also without absolute position embedding? + h = self.transformer.drop(h) if hasattr(self.transformer, "drop") else h + + for block in self.transformer.h: + h = block(h) + h = self.transformer.lm_head_norm(h) if hasattr(self.transformer, "lm_head_norm") else h + h = self.transformer.lm_head(h) if hasattr(self.transformer, "lm_head") else h + return h + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + """ + Forward pass of the GPT2LLM module. + + Args: + inputs (dict[str, torch.Tensor]): A dictionary containing input tensors. + - sample_key (str): Key for the input tensor containing token ids. + + Returns: + dict[str, torch.Tensor]: A dictionary containing output tensors. + - prediction_key (str): Key for the output tensor containing logits. 
+ """ + return self.forward_impl(inputs) + + def manual_scaled_dot_product_attention( query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None ) -> torch.Tensor: diff --git a/src/modalities/models/model_factory.py b/src/modalities/models/model_factory.py index 877c9cbdc..dc3d084a8 100644 --- a/src/modalities/models/model_factory.py +++ b/src/modalities/models/model_factory.py @@ -28,6 +28,7 @@ from modalities.exceptions import ModelStateError from modalities.models.gpt2.gpt2_model import ( GPT2LLM, + GPT2LLMPP, AttentionConfig, AttentionImplementation, LayerNormWrapperConfig, @@ -568,6 +569,7 @@ def get_gpt2_model( use_weight_tying: bool, use_meta_device: Optional[bool] = False, seed: int = None, + use_pp: Optional[bool] = False, ) -> GPT2LLM: config = dict( sample_key=sample_key, @@ -597,11 +599,12 @@ def get_gpt2_model( "Please set at least use_meta_device=False or use_weight_tying=False." "https://github.com/Modalities/modalities/issues/357" ) + gpt2_model_class = GPT2LLMPP if use_pp else GPT2LLM if use_meta_device: with torch.device("meta"): - model = GPT2LLM(**config) + model = gpt2_model_class(**config) else: - model = GPT2LLM(**config) + model = gpt2_model_class(**config) return model @staticmethod diff --git a/src/modalities/registry/components.py b/src/modalities/registry/components.py index 167a29894..b3a0a7618 100644 --- a/src/modalities/registry/components.py +++ b/src/modalities/registry/components.py @@ -78,7 +78,7 @@ ProgressSubscriberFactory, ResultsSubscriberFactory, ) -from modalities.loss_functions import CLMCrossEntropyLoss +from modalities.loss_functions import CLMCrossEntropyLoss, CLMCrossEntropyLossPP from modalities.models.coca.coca_model import CoCa, CoCaConfig from modalities.models.coca.collator import CoCaCollateFnConfig, CoCaCollatorFn from modalities.models.components.layer_norms import LayerNormConfig, RMSLayerNorm, RMSLayerNormConfig @@ -200,6 +200,7 @@ class ComponentEntity: ), # losses ComponentEntity("loss", 
"clm_cross_entropy_loss", CLMCrossEntropyLoss, CLMCrossEntropyLossConfig), + ComponentEntity("loss", "clm_cross_entropy_loss_pp", CLMCrossEntropyLossPP, CLMCrossEntropyLossConfig), # optmizers ComponentEntity("optimizer", "adam", OptimizerFactory.get_adam, AdamOptimizerConfig), ComponentEntity("optimizer", "adam_w", OptimizerFactory.get_adam_w, AdamWOptimizerConfig), diff --git a/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_fwd_bwd_pass.yaml b/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_fwd_bwd_pass.yaml new file mode 100644 index 000000000..6603b1850 --- /dev/null +++ b/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_fwd_bwd_pass.yaml @@ -0,0 +1,108 @@ +settings: + experiment_id: ${modalities_env:experiment_id} + config_file_path: ${modalities_env:config_file_path} + referencing_keys: + sample_key: input_ids + target_key: target_ids + prediction_key: logits + cuda_env: + local_rank: ${cuda_env:LOCAL_RANK} + global_rank: ${cuda_env:RANK} + world_size: ${cuda_env:WORLD_SIZE} + step_profile: + gradient_accumulation_steps: 1 + local_train_micro_batch_size: 4 + sequence_length: 256 + +loss_fn: + component_key: loss + variant_key: clm_cross_entropy_loss + config: + target_key: ${settings.referencing_keys.target_key} + prediction_key: ${settings.referencing_keys.prediction_key} + +device_mesh: + component_key: device_mesh + variant_key: default + config: + device_type: cuda + data_parallel_replicate_degree: 1 + data_parallel_shard_degree: -1 + world_size: ${settings.cuda_env.world_size} + +initialized_model: + component_key: model + variant_key: model_initialized + config: + model: + instance_key: fsdp_model + pass_type: BY_REFERENCE + model_initializer: + component_key: model_initialization + variant_key: composed + config: + model_type: gpt2 + weight_init_type: scaled + mean: 0.0 + std: 0.02 + num_layers: ${model_raw.config.n_layer} + +fsdp_model: 
+ component_key: model + variant_key: fsdp2_wrapped + config: + model: + instance_key: model_raw + pass_type: BY_REFERENCE + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + mixed_precision_settings: + param_dtype: BF_16 + reduce_dtype: BF_16 + block_names: [GPT2Block] + +model_raw: + component_key: model + variant_key: gpt2 + config: + use_meta_device: true + use_weight_tying: false + sample_key: ${settings.referencing_keys.sample_key} + poe_type: NOPE + sequence_length: ${settings.step_profile.sequence_length} + prediction_key: ${loss_fn.config.prediction_key} + vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency + n_layer: 6 + n_head_q: 8 + n_head_kv: 4 + ffn_hidden: 128 + n_embd: 128 + dropout: 0.0 + bias: true # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster + attention_config: + qkv_transforms: + - type_hint: RotaryTransform + config: + n_embd: ${model_raw.config.n_embd} + n_head: ${model_raw.config.n_head_q} #it has to be head_q here + seq_length_dim: -2 + base_freq: 10000 + attention_implementation: manual + activation_type: swiglu + attention_norm_config: + norm_type: layer_norm + config: + normalized_shape: ${model_raw.config.n_embd} + eps: 1e-5 + ffn_norm_config: + norm_type: layer_norm + config: + normalized_shape: ${model_raw.config.n_embd} + eps: 1e-5 + lm_head_norm_config: + norm_type: layer_norm + config: + normalized_shape: ${model_raw.config.n_embd} + eps: 1e-5 + diff --git a/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml b/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml index 0ceb02a53..5ef5f148a 100644 --- a/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml +++ b/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml @@ 
-16,7 +16,7 @@ settings: loss_fn: component_key: loss - variant_key: clm_cross_entropy_loss + variant_key: clm_cross_entropy_loss_pp config: target_key: ${settings.referencing_keys.target_key} prediction_key: ${settings.referencing_keys.prediction_key} @@ -129,6 +129,7 @@ model_raw: component_key: model variant_key: gpt2 config: + use_pp: true use_meta_device: true use_weight_tying: false sample_key: ${settings.referencing_keys.sample_key} diff --git a/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py b/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py index 6f861c1ea..7384b8338 100644 --- a/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py +++ b/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py @@ -9,8 +9,9 @@ from pydantic import BaseModel from modalities.__main__ import Main +from modalities.batch import InferenceResultBatch from modalities.config.config import ProcessGroupBackendType -from modalities.config.pydantic_if_types import PydanticFSDP2ModuleType, PydanticPipelineType +from modalities.config.pydantic_if_types import PydanticFSDP2ModuleType, PydanticLossIFType, PydanticPipelineType from modalities.models.parallelism.pipeline_parallelism import Pipeline from tests.end2end_tests.custom_components import MultiProcessingCudaEnv @@ -28,11 +29,16 @@ def temp_file_path() -> Path: os.remove(file_path) -class ComponentsInstantiationModel(BaseModel): +class ComponentsInstantiationPPModel(BaseModel): initialized_model: PydanticFSDP2ModuleType scheduled_pipeline: PydanticPipelineType +class ComponentsInstantiationModel(BaseModel): + initialized_model: PydanticFSDP2ModuleType + loss_fn: PydanticLossIFType + + @pytest.mark.skipif( torch.cuda.device_count() < 8, reason="This test requires 8 GPUs", @@ -57,11 +63,14 @@ def _get_tmp_sharding_config_path( return temp_file_path - def _get_components(self, config_file_path: Path) -> ComponentsInstantiationModel: + def 
_get_components(self, config_file_path: Path, use_pp: bool) -> ComponentsInstantiationPPModel: + torch.manual_seed(42) main_obj = Main(config_file_path) - components: ComponentsInstantiationModel = main_obj.build_components( - components_model_type=ComponentsInstantiationModel - ) + if use_pp: + components_model_type = ComponentsInstantiationPPModel + else: + components_model_type = ComponentsInstantiationModel + components: components_model_type = main_obj.build_components(components_model_type=components_model_type) return components @pytest.mark.parametrize( @@ -90,7 +99,7 @@ def _test_pp_impl( self, process_id: int, world_size: int, - gpt2_model_config_path: Path, + pp_model_config_path: Path, ): # wraps the actual test function to be able to run it in a distributed multiprocessing setup with MultiProcessingCudaEnv( @@ -100,7 +109,7 @@ def _test_pp_impl( world_size=world_size, rdvz_port=22356, ): - components = self._get_components(gpt2_model_config_path) + components = self._get_components(pp_model_config_path, use_pp=True) scheduled_pipeline = components.scheduled_pipeline vocab_size = 50304 sequence_length = 256 @@ -108,23 +117,38 @@ def _test_pp_impl( sequences = torch.randint(0, vocab_size, (batch_size, sequence_length)) targets = sequences[:, 1:].contiguous() inputs = sequences[:, :-1].contiguous() - self._forward_step(scheduled_pipeline, inputs, targets) + loss_pp = self._forward_step(scheduled_pipeline, inputs, targets) + + # if scheduled_pipeline.is_last_pp_stage: + working_dir = Path(os.path.dirname(__file__)) + fsdp2_model_config_path = working_dir / "configs/config_lorem_ipsum_long_fsdp2_fwd_bwd_pass.yaml" + fsdp2_components = self._get_components(fsdp2_model_config_path, use_pp=False) + fsdp2_model = fsdp2_components.initialized_model + fsdp2_loss_fn = fsdp2_components.loss_fn + + input_dict = {"input_ids": inputs} + fsdp2_out = fsdp2_model(input_dict) + forward_batch = InferenceResultBatch(predictions=fsdp2_out, targets={fsdp2_loss_fn.target_key: 
targets}) + fsdp2_loss = fsdp2_loss_fn(forward_batch) + if scheduled_pipeline.is_last_pp_stage: + assert torch.allclose(fsdp2_loss, loss_pp, atol=1e-6, rtol=1e-5), "Outputs do not match" def _forward_step(self, scheduled_pipeline: Pipeline, inputs: torch.Tensor, targets: torch.Tensor): """Runs a forward step on the model.""" pp_schedule = scheduled_pipeline.pp_schedule targets, losses = (targets, []) if scheduled_pipeline.is_last_pp_stage else (None, None) if scheduled_pipeline.is_first_pp_stage: # first stage - pp_schedule.step(inputs, target=targets, losses=losses, input_batch=inputs) + # pp_schedule.step(inputs, target=targets, losses=losses, input_batch=inputs) + pp_schedule.step(inputs, target=targets, losses=losses) else: # non-first stage - pp_schedule.step(target=targets, losses=losses, input_batch=inputs) + # pp_schedule.step(target=targets, losses=losses, input_batch=inputs) + # pp_schedule.step(inputs, target=targets, losses=losses, input_batch=inputs) + pp_schedule.step(target=targets, losses=losses) # accumulate losses across pipeline microbatches # TODO: PP+FSDP unexpectedly puts the loss back to the CPU - ( - torch.mean(torch.stack(losses)).to(self.device) - if self.pp_has_last_stage - else torch.tensor([-1.0], device=self.device) + return ( + torch.mean(torch.stack(losses)).to(losses[0].device) + if scheduled_pipeline.is_last_pp_stage + else torch.tensor([-1.0], device=inputs.device) ) - - # return output From 5b53ff97b3df6b780fbf747dfa6293359d9c7f43 Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Mon, 8 Sep 2025 12:29:55 +0200 Subject: [PATCH 16/67] refactor(parallelism): Removed necessity of additional model and loss classes for pipeline parallelism. 
--- src/modalities/loss_functions.py | 53 ++++++++++-- src/modalities/models/gpt2/gpt2_model.py | 82 ++++++------------- src/modalities/models/model_factory.py | 7 +- src/modalities/registry/components.py | 3 +- ...orem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml | 3 +- 5 files changed, 71 insertions(+), 77 deletions(-) diff --git a/src/modalities/loss_functions.py b/src/modalities/loss_functions.py index f46fb0398..e3be6100d 100644 --- a/src/modalities/loss_functions.py +++ b/src/modalities/loss_functions.py @@ -1,4 +1,5 @@ from abc import ABC, abstractmethod +from typing import overload import torch from torch.nn import CrossEntropyLoss @@ -31,9 +32,16 @@ def __init__(self, target_key: str, prediction_key: str, tag: str = "CLMCrossEnt # Mean over the tokens in the local-batch (batch per rank) self.loss_fun = CrossEntropyLoss(reduction="mean") + @overload def __call__(self, forward_batch: InferenceResultBatch) -> torch.Tensor: - labels = forward_batch.get_targets(self.target_key) - lm_logits = forward_batch.get_predictions(self.prediction_key) + ... + + @overload + def __call__(self, outputs: torch.Tensor, targets: torch.Tensor) -> torch.Tensor: + ... 
+ + def __call__(self, *args, **kwargs) -> torch.Tensor: + labels, lm_logits = self._parse_arguments(args, kwargs) # move labels to correct device to enable model parallelism labels = labels.to(lm_logits.device) @@ -43,13 +51,40 @@ def __call__(self, forward_batch: InferenceResultBatch) -> torch.Tensor: loss = self.loss_fun(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) return loss - -class CLMCrossEntropyLossPP(CLMCrossEntropyLoss): - def __call__(self, outputs: torch.Tensor, targets: torch.Tensor) -> torch.Tensor: - forward_batch = InferenceResultBatch( - predictions={self.prediction_key: outputs}, targets={self.target_key: targets} - ) - return super().__call__(forward_batch) + def _parse_arguments( + self, + args: list[torch.Tensor] | list[InferenceResultBatch], + kwargs: dict[str, torch.Tensor] | dict[str, InferenceResultBatch], + ) -> tuple[torch.Tensor, torch.Tensor]: + if len(args) == 1 and isinstance(args[0], InferenceResultBatch): + forward_batch = args[0] + labels = forward_batch.get_targets(self.target_key) + lm_logits = forward_batch.get_predictions(self.prediction_key) + elif "forward_batch" in kwargs and isinstance(kwargs["forward_batch"], InferenceResultBatch): + forward_batch = kwargs["forward_batch"] + labels = forward_batch.get_targets(self.target_key) + lm_logits = forward_batch.get_predictions(self.prediction_key) + elif len(args) == 2 and all(isinstance(arg, torch.Tensor) for arg in args): + lm_logits, labels = args + elif ( + "outputs" in kwargs + and "targets" in kwargs + and isinstance(kwargs["outputs"], torch.Tensor) + and isinstance(kwargs["targets"], torch.Tensor) + ): + lm_logits = kwargs["outputs"] + labels = kwargs["targets"] + elif ( + len(args) == 1 + and "targets" in kwargs + and isinstance(args[0], torch.Tensor) + and isinstance(kwargs["targets"], torch.Tensor) + ): + lm_logits = args[0] + labels = kwargs["targets"] + else: + raise TypeError("Invalid arguments for CLMCrossEntropyLoss.__call__") + return 
labels, lm_logits def nce_loss( diff --git a/src/modalities/models/gpt2/gpt2_model.py b/src/modalities/models/gpt2/gpt2_model.py index 76b0399ae..bdbd49913 100644 --- a/src/modalities/models/gpt2/gpt2_model.py +++ b/src/modalities/models/gpt2/gpt2_model.py @@ -2,7 +2,7 @@ import math from abc import abstractmethod from enum import Enum -from typing import Annotated, Optional +from typing import Annotated, Optional, overload import torch import torch.nn as nn @@ -344,7 +344,6 @@ class GPT2LLMConfig(BaseModel): ffn_norm_config: LayerNormWrapperConfig lm_head_norm_config: LayerNormWrapperConfig use_weight_tying: bool - use_pp: Optional[bool] = False @model_validator(mode="after") def check_divisibility(self) -> "GPT2LLMConfig": @@ -881,9 +880,10 @@ def __init__( self.transformer.lm_head.weight ) # https://paperswithcode.com/method/weight-tying - def forward_impl(self, inputs: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: + @overload + def forward(self, inputs: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: """ - Forward pass implementation of the GPT2LLM module. + Forward pass of the GPT2LLM module. Args: inputs (dict[str, torch.Tensor]): A dictionary containing input tensors. @@ -893,72 +893,50 @@ def forward_impl(self, inputs: dict[str, torch.Tensor]) -> dict[str, torch.Tenso dict[str, torch.Tensor]: A dictionary containing output tensors. - prediction_key (str): Key for the output tensor containing logits. 
""" - input_ids = inputs[self.sample_key] - device = input_ids.device - _, t = input_ids.size() # batch size, sequence length - assert t <= self.sequence_length, f"Cannot forward sequence of length {t}, the model's maximum " - f"input sequence length is only {self.sequence_length}" - - # forward the GPT model itself - tok_emb = self.transformer.wte(input_ids) # token embeddings of shape (b, t, n_embd) - - if self.poe_type is PositionTypes.ABSOLUTE: - pos = torch.arange(0, t, dtype=torch.long, device=device) # shape (t) - pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd) - tok_emb = tok_emb + pos_emb - - # TODO: use drop out also without absolute position embedding? - x = self.transformer.drop(tok_emb) - - for block in self.transformer.h: - x = block(x) - x = self.transformer.lm_head_norm(x) - logits = self.transformer.lm_head(x) - return {self.prediction_key: logits} + ... - def forward(self, inputs: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: + @overload + def forward(self, inputs: torch.Tensor) -> torch.Tensor: """ Forward pass of the GPT2LLM module. Args: - inputs (dict[str, torch.Tensor]): A dictionary containing input tensors. - - sample_key (str): Key for the input tensor containing token ids. + inputs (torch.Tensor): A tensor containing input token ids. Returns: - dict[str, torch.Tensor]: A dictionary containing output tensors. - - prediction_key (str): Key for the output tensor containing logits. + torch.Tensor: A tensor containing output logits. """ - return self.forward_impl(inputs) + ... - -class GPT2LLMPP(GPT2LLM): - """GPT2LLM class.""" + def forward(self, inputs: dict[str, torch.Tensor] | torch.Tensor) -> dict[str, torch.Tensor] | torch.Tensor: + if isinstance(inputs, dict): + return {self.prediction_key: self.forward_impl(inputs[self.sample_key])} + else: + return self.forward_impl(inputs) def forward_impl(self, inputs: torch.Tensor) -> torch.Tensor: """ Forward pass implementation of the GPT2LLM module. 
Args: - inputs (dict[str, torch.Tensor]): A dictionary containing input tensors. - - sample_key (str): Key for the input tensor containing token ids. + inputs (torch.Tensor): A tensor containing input token ids. Returns: - dict[str, torch.Tensor]: A dictionary containing output tensors. - - prediction_key (str): Key for the output tensor containing logits. + torch.Tensor: A tensor containing output logits. """ device = inputs.device - t = inputs.size(1) # batch size, sequence length - assert t <= self.sequence_length, f"Cannot forward sequence of length {t}, the model's maximum " - f"input sequence length is only {self.sequence_length}" + seq_len = inputs.size(1) + assert seq_len <= self.sequence_length, f"Cannot forward sequence of length {seq_len}, the model's maximum " + f"input sequence length is only {self.sequence_length}." # forward the GPT model itself h = ( self.transformer.wte(inputs) if hasattr(self.transformer, "wte") else inputs - ) # token embeddings of shape (b, t, n_embd) + ) # token embeddings of shape (b, seq_len, n_embd) if self.poe_type is PositionTypes.ABSOLUTE and hasattr(self.transformer, "wpe"): - pos = torch.arange(0, t, dtype=torch.long, device=device) # shape (t) - pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd) + pos = torch.arange(0, seq_len, dtype=torch.long, device=device) # shape (seq_len) + pos_emb = self.transformer.wpe(pos) # position embeddings of shape (seq_len, n_embd) h = h + pos_emb # TODO: use drop out also without absolute position embedding? @@ -970,20 +948,6 @@ def forward_impl(self, inputs: torch.Tensor) -> torch.Tensor: h = self.transformer.lm_head(h) if hasattr(self.transformer, "lm_head") else h return h - def forward(self, inputs: torch.Tensor) -> torch.Tensor: - """ - Forward pass of the GPT2LLM module. - - Args: - inputs (dict[str, torch.Tensor]): A dictionary containing input tensors. - - sample_key (str): Key for the input tensor containing token ids. 
- - Returns: - dict[str, torch.Tensor]: A dictionary containing output tensors. - - prediction_key (str): Key for the output tensor containing logits. - """ - return self.forward_impl(inputs) - def manual_scaled_dot_product_attention( query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None diff --git a/src/modalities/models/model_factory.py b/src/modalities/models/model_factory.py index dc3d084a8..877c9cbdc 100644 --- a/src/modalities/models/model_factory.py +++ b/src/modalities/models/model_factory.py @@ -28,7 +28,6 @@ from modalities.exceptions import ModelStateError from modalities.models.gpt2.gpt2_model import ( GPT2LLM, - GPT2LLMPP, AttentionConfig, AttentionImplementation, LayerNormWrapperConfig, @@ -569,7 +568,6 @@ def get_gpt2_model( use_weight_tying: bool, use_meta_device: Optional[bool] = False, seed: int = None, - use_pp: Optional[bool] = False, ) -> GPT2LLM: config = dict( sample_key=sample_key, @@ -599,12 +597,11 @@ def get_gpt2_model( "Please set at least use_meta_device=False or use_weight_tying=False." 
"https://github.com/Modalities/modalities/issues/357" ) - gpt2_model_class = GPT2LLMPP if use_pp else GPT2LLM if use_meta_device: with torch.device("meta"): - model = gpt2_model_class(**config) + model = GPT2LLM(**config) else: - model = gpt2_model_class(**config) + model = GPT2LLM(**config) return model @staticmethod diff --git a/src/modalities/registry/components.py b/src/modalities/registry/components.py index b3a0a7618..167a29894 100644 --- a/src/modalities/registry/components.py +++ b/src/modalities/registry/components.py @@ -78,7 +78,7 @@ ProgressSubscriberFactory, ResultsSubscriberFactory, ) -from modalities.loss_functions import CLMCrossEntropyLoss, CLMCrossEntropyLossPP +from modalities.loss_functions import CLMCrossEntropyLoss from modalities.models.coca.coca_model import CoCa, CoCaConfig from modalities.models.coca.collator import CoCaCollateFnConfig, CoCaCollatorFn from modalities.models.components.layer_norms import LayerNormConfig, RMSLayerNorm, RMSLayerNormConfig @@ -200,7 +200,6 @@ class ComponentEntity: ), # losses ComponentEntity("loss", "clm_cross_entropy_loss", CLMCrossEntropyLoss, CLMCrossEntropyLossConfig), - ComponentEntity("loss", "clm_cross_entropy_loss_pp", CLMCrossEntropyLossPP, CLMCrossEntropyLossConfig), # optmizers ComponentEntity("optimizer", "adam", OptimizerFactory.get_adam, AdamOptimizerConfig), ComponentEntity("optimizer", "adam_w", OptimizerFactory.get_adam_w, AdamWOptimizerConfig), diff --git a/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml b/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml index 5ef5f148a..0ceb02a53 100644 --- a/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml +++ b/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml @@ -16,7 +16,7 @@ settings: loss_fn: component_key: loss - 
variant_key: clm_cross_entropy_loss_pp + variant_key: clm_cross_entropy_loss config: target_key: ${settings.referencing_keys.target_key} prediction_key: ${settings.referencing_keys.prediction_key} @@ -129,7 +129,6 @@ model_raw: component_key: model variant_key: gpt2 config: - use_pp: true use_meta_device: true use_weight_tying: false sample_key: ${settings.referencing_keys.sample_key} From 5147a7ac98288f9c4163e7773b427f058b200ac3 Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Mon, 8 Sep 2025 12:31:43 +0200 Subject: [PATCH 17/67] refactor(parallelism): Clean up for pp test. --- .../test_pp_fwd_bwd_pass.py | 51 +++++++++++-------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py b/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py index 7384b8338..f933f4289 100644 --- a/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py +++ b/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py @@ -109,40 +109,34 @@ def _test_pp_impl( world_size=world_size, rdvz_port=22356, ): - components = self._get_components(pp_model_config_path, use_pp=True) - scheduled_pipeline = components.scheduled_pipeline vocab_size = 50304 sequence_length = 256 batch_size = 4 sequences = torch.randint(0, vocab_size, (batch_size, sequence_length)) targets = sequences[:, 1:].contiguous() inputs = sequences[:, :-1].contiguous() - loss_pp = self._forward_step(scheduled_pipeline, inputs, targets) - - # if scheduled_pipeline.is_last_pp_stage: - working_dir = Path(os.path.dirname(__file__)) - fsdp2_model_config_path = working_dir / "configs/config_lorem_ipsum_long_fsdp2_fwd_bwd_pass.yaml" - fsdp2_components = self._get_components(fsdp2_model_config_path, use_pp=False) - fsdp2_model = fsdp2_components.initialized_model - fsdp2_loss_fn = fsdp2_components.loss_fn - - input_dict = {"input_ids": inputs} - fsdp2_out = fsdp2_model(input_dict) - forward_batch = 
InferenceResultBatch(predictions=fsdp2_out, targets={fsdp2_loss_fn.target_key: targets}) - fsdp2_loss = fsdp2_loss_fn(forward_batch) - if scheduled_pipeline.is_last_pp_stage: - assert torch.allclose(fsdp2_loss, loss_pp, atol=1e-6, rtol=1e-5), "Outputs do not match" + + is_last_pp_stage, loss_pp = self._forward_step_with_pp(pp_model_config_path, inputs, targets) + fsdp2_loss = self._forward_step_without_pp(inputs, targets) + + if is_last_pp_stage: + assert torch.allclose(loss_pp, fsdp2_loss, atol=1e-6, rtol=1e-5), "Losses do not match" + + def _forward_step_with_pp( + self, pp_model_config_path: Path, inputs: torch.Tensor, targets: torch.Tensor + ) -> tuple[bool, torch.Tensor]: + components = self._get_components(pp_model_config_path, use_pp=True) + scheduled_pipeline = components.scheduled_pipeline + loss_pp = self._forward_step(scheduled_pipeline, inputs, targets) + return scheduled_pipeline.is_last_pp_stage, loss_pp def _forward_step(self, scheduled_pipeline: Pipeline, inputs: torch.Tensor, targets: torch.Tensor): """Runs a forward step on the model.""" pp_schedule = scheduled_pipeline.pp_schedule targets, losses = (targets, []) if scheduled_pipeline.is_last_pp_stage else (None, None) - if scheduled_pipeline.is_first_pp_stage: # first stage - # pp_schedule.step(inputs, target=targets, losses=losses, input_batch=inputs) + if scheduled_pipeline.is_first_pp_stage: pp_schedule.step(inputs, target=targets, losses=losses) - else: # non-first stage - # pp_schedule.step(target=targets, losses=losses, input_batch=inputs) - # pp_schedule.step(inputs, target=targets, losses=losses, input_batch=inputs) + else: pp_schedule.step(target=targets, losses=losses) # accumulate losses across pipeline microbatches @@ -152,3 +146,16 @@ def _forward_step(self, scheduled_pipeline: Pipeline, inputs: torch.Tensor, targ if scheduled_pipeline.is_last_pp_stage else torch.tensor([-1.0], device=inputs.device) ) + + def _forward_step_without_pp(self, inputs: torch.Tensor, targets: torch.Tensor) 
-> torch.Tensor: + working_dir = Path(os.path.dirname(__file__)) + fsdp2_model_config_path = working_dir / "configs/config_lorem_ipsum_long_fsdp2_fwd_bwd_pass.yaml" + fsdp2_components = self._get_components(fsdp2_model_config_path, use_pp=False) + fsdp2_model = fsdp2_components.initialized_model + fsdp2_loss_fn = fsdp2_components.loss_fn + + input_dict = {"input_ids": inputs} + fsdp2_out = fsdp2_model(input_dict) + forward_batch = InferenceResultBatch(predictions=fsdp2_out, targets={fsdp2_loss_fn.target_key: targets}) + fsdp2_loss = fsdp2_loss_fn(forward_batch) + return fsdp2_loss From 1cb977954daaea91b72dd878b4693a9d8d7dbd64 Mon Sep 17 00:00:00 2001 From: rrutmann Date: Mon, 8 Sep 2025 13:28:34 +0200 Subject: [PATCH 18/67] test: Print losses to debug tests --- .../pipeline_parallelism/test_pp_fwd_bwd_pass.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py b/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py index f933f4289..b9bb462fd 100644 --- a/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py +++ b/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py @@ -120,6 +120,7 @@ def _test_pp_impl( fsdp2_loss = self._forward_step_without_pp(inputs, targets) if is_last_pp_stage: + print(f"Loss with PP: {loss_pp.item()}, Loss without PP: {fsdp2_loss.item()}") assert torch.allclose(loss_pp, fsdp2_loss, atol=1e-6, rtol=1e-5), "Losses do not match" def _forward_step_with_pp( From 27ad56dffd5fa5b6d1a065d937493ad1412796f6 Mon Sep 17 00:00:00 2001 From: rrutmann Date: Tue, 9 Sep 2025 11:12:00 +0200 Subject: [PATCH 19/67] feat: Use scheduled_pipeline for forwad backward pass --- src/modalities/config/instantiation_models.py | 2 ++ src/modalities/main.py | 1 + 2 files changed, 3 insertions(+) diff --git a/src/modalities/config/instantiation_models.py b/src/modalities/config/instantiation_models.py index 4c57c133d..578641a64 100644 --- 
a/src/modalities/config/instantiation_models.py +++ b/src/modalities/config/instantiation_models.py @@ -13,6 +13,7 @@ PydanticLossIFType, PydanticMessageSubscriberIFType, PydanticMFUCalculatorABCType, + PydanticPipelineType, PydanticPytorchDeviceType, PydanticPytorchModuleType, PydanticTextInferenceComponentType, @@ -178,6 +179,7 @@ def _check_last_step_checkpointed(self) -> "TrainingComponentsInstantiationModel checkpoint_saving: PydanticCheckpointSavingIFType gradient_clipper: PydanticGradientClipperIFType mfu_calculator: Optional[PydanticMFUCalculatorABCType] = None + scheduled_pipeline: Optional[PydanticPipelineType] = None model_raw: PydanticPytorchModuleType @model_validator(mode="after") diff --git a/src/modalities/main.py b/src/modalities/main.py index d995b9168..271a6759a 100644 --- a/src/modalities/main.py +++ b/src/modalities/main.py @@ -169,6 +169,7 @@ def run(self, components: TrainingComponentsInstantiationModel): checkpointing_interval_in_steps=components.settings.intervals.checkpointing_interval_in_steps, evaluation_interval_in_steps=components.settings.intervals.evaluation_interval_in_steps, training_log_interval_in_steps=components.settings.intervals.training_log_interval_in_steps, + scheduled_pipeline=components.scheduled_pipeline if components.scheduled_pipeline else None, ) def get_logging_publishers( From 41c4f36d3760e5213a77e2ec4a383203414323ac Mon Sep 17 00:00:00 2001 From: rrutmann Date: Tue, 9 Sep 2025 14:32:12 +0200 Subject: [PATCH 20/67] feat: Use scheduled_pipeline for training --- src/modalities/gym.py | 7 ++++++- src/modalities/trainer.py | 39 +++++++++++++++++++++++++++++++-------- 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/src/modalities/gym.py b/src/modalities/gym.py index 0394b7a28..65b29fab8 100644 --- a/src/modalities/gym.py +++ b/src/modalities/gym.py @@ -40,6 +40,7 @@ def run( train_data_loader: LLMDataLoader, evaluation_data_loaders: list[LLMDataLoader], checkpoint_saving: CheckpointSaving, + 
scheduled_pipeline=None, # TODO set type ): """Runs the model training, including evaluation and checkpointing. @@ -57,6 +58,7 @@ def run( model=app_state.model, evaluation_data_loaders=evaluation_data_loaders, evaluation_interval_in_steps=evaluation_interval_in_steps, + scheduled_pipeline=scheduled_pipeline, ) checkpointing_callback: Callable[[TrainingProgress], None] = partial( @@ -74,6 +76,7 @@ def run( evaluation_callback=evaluation_callback, checkpointing_callback=checkpointing_callback, training_log_interval_in_steps=training_log_interval_in_steps, + scheduled_pipeline=scheduled_pipeline, ) print_rank_0(f"Training done at {datetime.now()}.") @@ -101,11 +104,13 @@ def _run_evaluation( num_train_steps_done: int, evaluation_data_loaders: list[LLMDataLoader], evaluation_interval_in_steps: int, + scheduled_pipeline=None, # TODO set type ): - if num_train_steps_done % evaluation_interval_in_steps == 0: + if num_train_steps_done % evaluation_interval_in_steps == 0 and num_train_steps_done > 10: self.evaluator.evaluate( model=model, data_loaders=evaluation_data_loaders, loss_fun=self.loss_fun, num_train_steps_done=num_train_steps_done, + scheduled_pipeline=scheduled_pipeline, ) diff --git a/src/modalities/trainer.py b/src/modalities/trainer.py index b443c0ad3..55213cf9f 100644 --- a/src/modalities/trainer.py +++ b/src/modalities/trainer.py @@ -95,6 +95,7 @@ def _train_batch( scheduler: LRScheduler, loss_fun: Loss, micro_batch_id: int, + scheduled_pipeline=None, # TODO set type ) -> tuple[bool, int, torch.Tensor, Optional[torch.Tensor]]: """ Conducts a training step on batch of data. @@ -116,9 +117,27 @@ def _train_batch( - gradient_norm_score (Optional[torch.Tensor]): The gradient norm score, if a training step was performed otherwise return None. 
""" - result_batch = model_predict_batch(model=model, batch=batch) - loss = loss_fun(result_batch) - (loss / self.gradient_acc_steps).backward() + if scheduled_pipeline is not None: + pp_schedule = scheduled_pipeline.pp_schedule + # TODO: handle loss and backward in pp + # Pipeline Parallel forward / backward inside step() call + # with self.train_context(optional_context_parallel_ctx): + targets, losses = ( + (batch.targets[loss_fun.target_key].contiguous(), []) + if scheduled_pipeline.is_last_pp_stage + else (None, None) + ) + + if scheduled_pipeline.is_first_pp_stage: + pp_schedule.step(batch.samples[model.sample_key].contiguous(), target=targets, losses=losses) + else: + pp_schedule.step(target=targets, losses=losses) + loss = torch.mean(torch.stack(losses)).to(losses[0].device) if scheduled_pipeline.is_last_pp_stage else None + else: + # else continue with loss calculation + result_batch = model_predict_batch(model=model, batch=batch) + loss = loss_fun(result_batch) + (loss / self.gradient_acc_steps).backward() if (micro_batch_id + 1) % self.gradient_acc_steps == 0: gradient_norm_score = self.gradient_clipper.clip_gradients() @@ -143,6 +162,7 @@ def train( training_log_interval_in_steps: int, evaluation_callback: Callable[[TrainingProgress], None], checkpointing_callback: Callable[[TrainingProgress], None], + scheduled_pipeline=None, # TODO set type ): """ Trains the model. 
@@ -206,15 +226,17 @@ def train( scheduler=lr_scheduler, loss_fun=loss_fun, micro_batch_id=micro_batch_id, + scheduled_pipeline=scheduled_pipeline, ) forward_backward_time_recorder.stop() training_progress.num_seen_steps_current_run = num_train_steps_done training_progress.num_seen_tokens_current_run = self.global_num_tokens_per_train_step * num_train_steps_done - # Save the batch loss - cumulated_losses[0] += batch_loss.item() - # This works, because we always drop the last batch in case it has less samples than the batch size - cumulated_losses[-1] += 1 # number of local batches + if batch_loss is not None: + # Save the batch loss + cumulated_losses[0] += batch_loss.item() + # This works, because we always drop the last batch in case it has less samples than the batch size + cumulated_losses[-1] += 1 # number of local batches # gradient norm is already synced across all ranks if gradient_norm_score is not None: @@ -243,7 +265,8 @@ def train( synced_num_samples_per_second = synced_num_samples / synced_forward_backward_time # TODO: insert reducer from outside so Trainer is independent of FSDP # add the loss and gradient norm for the LAST batch - cumulated_losses[1] = batch_loss.item() + + cumulated_losses[1] = batch_loss.item() if batch_loss is not None else 0.0 reduced_losses = Reducer.reduce( tensor=cumulated_losses, From 6f3d5da3a11573c2017f5670218631303a48df26 Mon Sep 17 00:00:00 2001 From: rrutmann Date: Tue, 9 Sep 2025 17:48:24 +0200 Subject: [PATCH 21/67] feat: Use scheduled_pipe in evaluation --- src/modalities/evaluator.py | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/src/modalities/evaluator.py b/src/modalities/evaluator.py index 456fcb47f..5d56bb90c 100644 --- a/src/modalities/evaluator.py +++ b/src/modalities/evaluator.py @@ -36,6 +36,7 @@ def evaluate_batch( batch: DatasetBatch, model: nn.Module, loss_fun: Callable[[InferenceResultBatch], torch.Tensor], + scheduled_pipeline=None, # TODO set type ) -> 
torch.Tensor: """Evaluate a single batch by forwarding it through the model and calculating the loss. @@ -48,8 +49,26 @@ def evaluate_batch( torch.Tensor: The loss of the batch """ with torch.no_grad(): - result_batch = model_predict_batch(model=model, batch=batch) - loss = loss_fun(result_batch) + if scheduled_pipeline is not None: + pp_schedule = scheduled_pipeline.pp_schedule + targets, losses = ( + (batch.targets[loss_fun.target_key].contiguous(), []) + if scheduled_pipeline.is_last_pp_stage + else (None, None) + ) + + if scheduled_pipeline.is_first_pp_stage: + pp_schedule.eval(batch.samples[model.sample_key].contiguous(), target=targets, losses=losses) + else: + pp_schedule.eval(target=targets, losses=losses) + loss = ( + torch.mean(torch.stack(losses)).to(losses[0].device) + if scheduled_pipeline.is_last_pp_stage + else None + ) + else: + result_batch = model_predict_batch(model=model, batch=batch) + loss = loss_fun(result_batch) return loss def evaluate( @@ -58,6 +77,7 @@ def evaluate( data_loaders: list[LLMDataLoader], loss_fun: Callable[[InferenceResultBatch], torch.Tensor], num_train_steps_done: int, + scheduled_pipeline=None, # TODO set type ) -> dict[str, EvaluationResultBatch]: """Evaluate the model on a set of datasets. 
@@ -90,10 +110,12 @@ def evaluate( batch=batch, model=model, loss_fun=loss_fun, + scheduled_pipeline=scheduled_pipeline, ) - cumulated_loss[0] += batch_loss.item() # sum up batch loss - cumulated_loss[1] += 1 + if batch_loss is not None: + cumulated_loss[0] += batch_loss.item() # sum up batch loss + cumulated_loss[1] += 1 batch_length_tensor = torch.tensor(len(batch)).to(device) thoughput_aggregator.add_value(key=ThroughputAggregationKeys.NUM_SAMPLES, value=batch_length_tensor) From 9b853340f207b0c728d98af9be33249378726d58 Mon Sep 17 00:00:00 2001 From: rrutmann Date: Tue, 9 Sep 2025 17:49:31 +0200 Subject: [PATCH 22/67] test: Print losses if test fails --- .../pipeline_parallelism/test_pp_fwd_bwd_pass.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py b/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py index b9bb462fd..47e1fe990 100644 --- a/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py +++ b/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py @@ -120,8 +120,9 @@ def _test_pp_impl( fsdp2_loss = self._forward_step_without_pp(inputs, targets) if is_last_pp_stage: - print(f"Loss with PP: {loss_pp.item()}, Loss without PP: {fsdp2_loss.item()}") - assert torch.allclose(loss_pp, fsdp2_loss, atol=1e-6, rtol=1e-5), "Losses do not match" + assert torch.allclose( + loss_pp, fsdp2_loss, atol=1e-6, rtol=1e-5 + ), f"Losses do not match.\nLoss with PP: {loss_pp.item()}, Loss without PP: {fsdp2_loss.item()}" def _forward_step_with_pp( self, pp_model_config_path: Path, inputs: torch.Tensor, targets: torch.Tensor From 84e2702627bc594fb1285ca8cc6f0d9a6af6a9f5 Mon Sep 17 00:00:00 2001 From: rrutmann Date: Tue, 9 Sep 2025 17:50:03 +0200 Subject: [PATCH 23/67] chore: Run evaluation before training --- src/modalities/gym.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/modalities/gym.py 
b/src/modalities/gym.py index 65b29fab8..1e6fadf5a 100644 --- a/src/modalities/gym.py +++ b/src/modalities/gym.py @@ -106,7 +106,7 @@ def _run_evaluation( evaluation_interval_in_steps: int, scheduled_pipeline=None, # TODO set type ): - if num_train_steps_done % evaluation_interval_in_steps == 0 and num_train_steps_done > 10: + if num_train_steps_done % evaluation_interval_in_steps == 0: self.evaluator.evaluate( model=model, data_loaders=evaluation_data_loaders, From 32fbe9499c6a27c7f6e639cfdb8687207df0e4df Mon Sep 17 00:00:00 2001 From: rrutmann Date: Tue, 9 Sep 2025 17:50:55 +0200 Subject: [PATCH 24/67] chore: Increase microbatch size --- config_files/training/config_lorem_ipsum_long_fsdp2_pp.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config_files/training/config_lorem_ipsum_long_fsdp2_pp.yaml b/config_files/training/config_lorem_ipsum_long_fsdp2_pp.yaml index 381550a20..5d5557e6f 100644 --- a/config_files/training/config_lorem_ipsum_long_fsdp2_pp.yaml +++ b/config_files/training/config_lorem_ipsum_long_fsdp2_pp.yaml @@ -24,7 +24,7 @@ settings: enforce_last_step_checkpointed: false step_profile: gradient_accumulation_steps: 1 - local_train_micro_batch_size: 2 + local_train_micro_batch_size: 4 sequence_length: 256 training_target: num_target_tokens: @@ -222,7 +222,7 @@ scheduled_pipeline: pass_type: BY_REFERENCE pp_schedule_name: gpipe batch_size: ${settings.step_profile.local_train_micro_batch_size} - microbatch_size: 1 + microbatch_size: 2 pp_degree: ${device_mesh.config.pipeline_parallel_degree} # maybe better to use the fsdp model and the schedule here # instead of passing in the staged pipeline? 
From 61ab3114b4305890a642497c12bc6afe82e678f2 Mon Sep 17 00:00:00 2001 From: rrutmann Date: Wed, 10 Sep 2025 17:24:36 +0200 Subject: [PATCH 25/67] fix: Use dp size instead of world size for last batch aggregation --- src/modalities/config/instantiation_models.py | 2 ++ src/modalities/main.py | 3 +++ .../running_env/fsdp/device_mesh.py | 22 +++++++++++++++++++ src/modalities/trainer.py | 6 ++++- 4 files changed, 32 insertions(+), 1 deletion(-) diff --git a/src/modalities/config/instantiation_models.py b/src/modalities/config/instantiation_models.py index 578641a64..6e3b12d8a 100644 --- a/src/modalities/config/instantiation_models.py +++ b/src/modalities/config/instantiation_models.py @@ -8,6 +8,7 @@ PydanticAppStateType, PydanticCheckpointSavingIFType, PydanticDatasetIFType, + PydanticDeviceMeshIFType, PydanticGradientClipperIFType, PydanticLLMDataLoaderIFType, PydanticLossIFType, @@ -180,6 +181,7 @@ def _check_last_step_checkpointed(self) -> "TrainingComponentsInstantiationModel gradient_clipper: PydanticGradientClipperIFType mfu_calculator: Optional[PydanticMFUCalculatorABCType] = None scheduled_pipeline: Optional[PydanticPipelineType] = None + device_mesh: PydanticDeviceMeshIFType model_raw: PydanticPytorchModuleType @model_validator(mode="after") diff --git a/src/modalities/main.py b/src/modalities/main.py index 271a6759a..f64ea16bf 100644 --- a/src/modalities/main.py +++ b/src/modalities/main.py @@ -20,6 +20,7 @@ from modalities.logging_broker.subscriber import MessageSubscriberIF from modalities.registry.components import COMPONENTS from modalities.registry.registry import Registry +from modalities.running_env.fsdp.device_mesh import get_num_data_parallel_ranks from modalities.trainer import Trainer from modalities.util import get_synced_experiment_id_of_run, get_total_number_of_trainable_parameters, print_rank_0 @@ -116,6 +117,7 @@ def run(self, components: TrainingComponentsInstantiationModel): * components.settings.step_profile.gradient_accumulation_steps * 
components.settings.cuda_env.world_size ) + num_data_parallel_ranks = get_num_data_parallel_ranks(components.device_mesh) trainer = Trainer( global_rank=components.settings.cuda_env.global_rank, progress_publisher=progress_publisher, @@ -128,6 +130,7 @@ def run(self, components: TrainingComponentsInstantiationModel): gradient_clipper=components.gradient_clipper, global_num_tokens_per_train_step=global_num_tokens_per_train_step, mfu_calculator=components.mfu_calculator, + num_data_parallel_ranks=num_data_parallel_ranks, ) # Evaluator diff --git a/src/modalities/running_env/fsdp/device_mesh.py b/src/modalities/running_env/fsdp/device_mesh.py index 24e7d6e18..c74751362 100644 --- a/src/modalities/running_env/fsdp/device_mesh.py +++ b/src/modalities/running_env/fsdp/device_mesh.py @@ -127,3 +127,25 @@ def get_device_mesh( # TODO: Torch Titan had some more checks here. We need to check if we also need those: # https://github.com/pytorch/torchtitan/blob/b291ad662493b63d25b038a30a915082d3617baf/torchtitan/distributed/parallel_dims.py#L86-L104 return device_mesh + + +def get_num_data_parallel_ranks(device_mesh: DeviceMesh) -> int: + """Gets the number of data parallel ranks from the device mesh. + + Args: + device_mesh (DeviceMesh): The device mesh. + + Returns: + int: The number of data parallel ranks. 
+ """ + world_size = device_mesh.size() + dp_size = world_size + for parallelism_degree in ( + ParallelismDegrees.TP.value, + ParallelismDegrees.PP.value, + ParallelismDegrees.CP.value, + ): + if parallelism_degree in device_mesh.mesh_dim_names: + dp_size //= device_mesh.size(device_mesh.mesh_dim_names.index(parallelism_degree)) + + return dp_size diff --git a/src/modalities/trainer.py b/src/modalities/trainer.py index 55213cf9f..9960920ba 100644 --- a/src/modalities/trainer.py +++ b/src/modalities/trainer.py @@ -30,6 +30,7 @@ class Trainer: def __init__( self, global_rank: int, + num_data_parallel_ranks: int, progress_publisher: MessagePublisher[ProgressUpdate], evaluation_result_publisher: MessagePublisher[EvaluationResultBatch], gradient_acc_steps: int, @@ -62,6 +63,7 @@ def __init__( None """ self.global_rank = global_rank + self.num_data_parallel_ranks = num_data_parallel_ranks self.progress_publisher = progress_publisher self.evaluation_result_publisher = evaluation_result_publisher self.gradient_acc_steps = gradient_acc_steps @@ -273,7 +275,9 @@ def train( operation=dist.ReduceOp.SUM, # 1.) summed batch loss / (num batches * world size) # 2.) 
last batch loss / world size - post_processing_fun=lambda t: torch.stack([t[0] / t[-1], t[1] / dist.get_world_size()]), + post_processing_fun=lambda t: torch.stack( + [t[0] / t[-1], t[1] / self.num_data_parallel_ranks, t[-1]] + ), ) train_loss_avg, train_loss_last_batch = ( From 6952bcc1156b85fb24e6b06fc6735584dce1be45 Mon Sep 17 00:00:00 2001 From: rrutmann Date: Wed, 10 Sep 2025 17:25:09 +0200 Subject: [PATCH 26/67] docs: Add TODOs for later check --- src/modalities/util.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/modalities/util.py b/src/modalities/util.py index eee5ff108..42b1360ac 100644 --- a/src/modalities/util.py +++ b/src/modalities/util.py @@ -186,9 +186,11 @@ def get_total_number_of_trainable_parameters(model: FSDPX) -> Number: # However, users can also provide their own sharding process groups (currently not supported in Modalities) # which would require to adapt the code. if model.sharding_strategy.name == "NO_SHARD": - sharding_factor = dist.get_world_size() + sharding_factor = dist.get_world_size() # TODO Check if we should use number of data parallel ranks instead if model.sharding_strategy.name == "HYBRID_SHARD": - sharding_factor = dist.get_world_size() // torch.cuda.device_count() + sharding_factor = ( + dist.get_world_size() // torch.cuda.device_count() + ) # TODO Check if we should use number of data parallel ranks instead elif model.sharding_strategy.name == "FULL_SHARD": sharding_factor = 1 total_num_params = total_num_params // sharding_factor From 90dbe51c22df6769ad196cb014b1fed67732f355 Mon Sep 17 00:00:00 2001 From: rrutmann Date: Wed, 10 Sep 2025 17:33:13 +0200 Subject: [PATCH 27/67] fix: Train before evaluation so that pp is initialized for backwards --- src/modalities/gym.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/modalities/gym.py b/src/modalities/gym.py index 1e6fadf5a..30f7a820f 100644 --- a/src/modalities/gym.py +++ b/src/modalities/gym.py @@ -106,7 +106,7 @@ def 
_run_evaluation( evaluation_interval_in_steps: int, scheduled_pipeline=None, # TODO set type ): - if num_train_steps_done % evaluation_interval_in_steps == 0: + if num_train_steps_done > 0 and num_train_steps_done % evaluation_interval_in_steps == 0: self.evaluator.evaluate( model=model, data_loaders=evaluation_data_loaders, From 49df7d616ca43d36a4ff8198c03b23cb98556869 Mon Sep 17 00:00:00 2001 From: rrutmann Date: Fri, 12 Sep 2025 14:28:40 +0200 Subject: [PATCH 28/67] fix: Add missing parameter seed to GPT2LLMConfig --- src/modalities/models/gpt2/gpt2_model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/modalities/models/gpt2/gpt2_model.py b/src/modalities/models/gpt2/gpt2_model.py index bdbd49913..1de776101 100644 --- a/src/modalities/models/gpt2/gpt2_model.py +++ b/src/modalities/models/gpt2/gpt2_model.py @@ -319,7 +319,7 @@ class GPT2LLMConfig(BaseModel): ffn_norm_config (LayerNormWrapperConfig): Config for normalization of the feed-forward network. lm_head_norm_config (LayerNormWrapperConfig): Config for normalization of the language model head. use_weight_tying (bool): Whether to use weight tying. - + seed (int, optional): The seed for random number generation. Defaults to None. 
""" sample_key: str @@ -344,6 +344,7 @@ class GPT2LLMConfig(BaseModel): ffn_norm_config: LayerNormWrapperConfig lm_head_norm_config: LayerNormWrapperConfig use_weight_tying: bool + seed: Optional[int] = None @model_validator(mode="after") def check_divisibility(self) -> "GPT2LLMConfig": From 7996a299609dc4b78d122be8581cef971119c742 Mon Sep 17 00:00:00 2001 From: rrutmann Date: Mon, 15 Sep 2025 10:38:13 +0200 Subject: [PATCH 29/67] fix: Retrieve all PP ranks for gradient clipping --- .../fsdp_gradient_clipper.py | 54 ++++++++++++++++--- .../fsdp_gradient_clipper_config.py | 4 +- 2 files changed, 49 insertions(+), 9 deletions(-) diff --git a/src/modalities/training/gradient_clipping/fsdp_gradient_clipper.py b/src/modalities/training/gradient_clipping/fsdp_gradient_clipper.py index f1adddfb3..d4b280a32 100644 --- a/src/modalities/training/gradient_clipping/fsdp_gradient_clipper.py +++ b/src/modalities/training/gradient_clipping/fsdp_gradient_clipper.py @@ -1,11 +1,15 @@ +import math from typing import Iterable, Optional import torch +from torch import distributed as dist +from torch.distributed.device_mesh import DeviceMesh from torch.distributed.fsdp import FSDPModule as FSDP2 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP1 from torch.distributed.tensor import DTensor from modalities.config.lookup_enum import LookupEnum +from modalities.running_env.fsdp.device_mesh import ParallelismDegrees from modalities.training.gradient_clipping.gradient_clipper import GradientClipperIF @@ -91,7 +95,13 @@ def clip_gradients(self) -> torch.Tensor: class FSDP2GradientClipper(GradientClipperIF): """The FSDP2GradientClipper class that is responsible for clipping the gradients of a model wrapped with FSDP.""" - def __init__(self, wrapped_model: FSDP2, max_norm: float, norm_type=GradientClippingMode) -> None: + def __init__( + self, + wrapped_model: FSDP2, + max_norm: float, + norm_type=GradientClippingMode, + device_mesh: Optional[DeviceMesh] = None, + ) -> None: 
""" Initialize the FSDP2GradientClipper object. @@ -106,6 +116,7 @@ def __init__(self, wrapped_model: FSDP2, max_norm: float, norm_type=GradientClip self.wrapped_model = wrapped_model self.max_norm = max_norm self.norm_type = norm_type + self.device_mesh = device_mesh @torch.no_grad() def clip_gradients(self) -> torch.Tensor: @@ -121,6 +132,7 @@ def clip_gradients(self) -> torch.Tensor: norm_type=self.norm_type.value, error_if_nonfinite=True, foreach=True, + device_mesh=self.device_mesh, ) return gradient_norm_score @@ -131,6 +143,7 @@ def clip_grad_norm_( norm_type: float = 2.0, error_if_nonfinite: bool = False, foreach: Optional[bool] = None, + device_mesh: Optional[DeviceMesh] = None, ) -> torch.Tensor: """ Clip the gradient norm of an iterable of parameters. @@ -138,10 +151,6 @@ def clip_grad_norm_( Gradient norm clipping requires computing the gradient norm over the entire model. `torch.nn.utils.clip_grad_norm_` only computes gradient norm along DP/FSDP/TP dimensions. - TODO: for pipeline parallelism, we need to implement it like here: - https://github.com/pytorch/torchtitan/blob/b291ad662493b63d25b038a30a915082d3617baf/torchtitan/distributed/utils.py#L245 - I removed all the code w.r.t. pipeline parallelism for now. - Args: parameters: an iterable of Tensors or a single Tensor that will have gradients normalized max_norm (float): max norm of the gradients @@ -154,6 +163,7 @@ def clip_grad_norm_( If ``None``, use the foreach implementation for CUDA and CPU native tensors and silently fall back to the slow implementation for other device types. Default: ``None`` + device_mesh: device mesh Returns: Total norm of the parameter gradients (viewed as a single vector). @@ -172,11 +182,23 @@ def clip_grad_norm_( if isinstance(total_norm, DTensor): # Will reach here if any non-PP parallelism is used. # If only using PP, total_norm will be a local tensor. 
+ total_norm = total_norm.full_tensor() - torch.nn.utils.clip_grads_with_norm_( - parameters=parameters, max_norm=max_norm, total_norm=total_norm, foreach=foreach + pp_mesh = ( + device_mesh[ParallelismDegrees.PP.value] + if device_mesh is not None and ParallelismDegrees.PP.value in device_mesh.mesh_dim_names + else None ) + if pp_mesh is not None: + if math.isinf(norm_type): + dist.all_reduce(total_norm, op=dist.ReduceOp.MAX, group=pp_mesh.get_group()) + else: + total_norm **= norm_type + dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=pp_mesh.get_group()) + total_norm **= 1.0 / norm_type + + torch.nn.utils.clip_grads_with_norm_(parameters, max_norm, total_norm, foreach) return total_norm @@ -184,7 +206,9 @@ class FSDP2LoggingOnlyGradientClipper(GradientClipperIF): """The FSDP2LoggingOnlyGradientClipper class that is responsible for logging the gradient norms without actually clipping the gradients.""" - def __init__(self, wrapped_model: FSDP2, norm_type=GradientClippingMode) -> None: + def __init__( + self, wrapped_model: FSDP2, norm_type=GradientClippingMode, device_mesh: Optional[DeviceMesh] = None + ) -> None: """ Initialize the FSDP2LoggingOnlyGradientClipper. @@ -197,6 +221,7 @@ def __init__(self, wrapped_model: FSDP2, norm_type=GradientClippingMode) -> None """ self.wrapped_model = wrapped_model self.norm_type = norm_type + self.device_mesh = device_mesh @torch.no_grad() def clip_gradients(self) -> torch.Tensor: @@ -214,6 +239,19 @@ def clip_gradients(self) -> torch.Tensor: # Will reach here if any non-PP parallelism is used. # If only using PP, total_norm will be a local tensor. 
total_norm = total_norm.full_tensor() + + pp_mesh = ( + self.device_mesh[ParallelismDegrees.PP.value] + if self.device_mesh is not None and ParallelismDegrees.PP.value in self.device_mesh.mesh_dim_names + else None + ) + if pp_mesh is not None: + if math.isinf(self.norm_type.value): + dist.all_reduce(total_norm, op=dist.ReduceOp.MAX, group=pp_mesh.get_group()) + else: + total_norm **= self.norm_type.value + dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=pp_mesh.get_group()) + total_norm **= 1.0 / self.norm_type.value return total_norm diff --git a/src/modalities/training/gradient_clipping/fsdp_gradient_clipper_config.py b/src/modalities/training/gradient_clipping/fsdp_gradient_clipper_config.py index 4b4dd807d..500d954d8 100644 --- a/src/modalities/training/gradient_clipping/fsdp_gradient_clipper_config.py +++ b/src/modalities/training/gradient_clipping/fsdp_gradient_clipper_config.py @@ -2,7 +2,7 @@ from pydantic import BaseModel, Field -from modalities.config.pydantic_if_types import PydanticPytorchModuleType +from modalities.config.pydantic_if_types import PydanticDeviceMeshIFType, PydanticPytorchModuleType from modalities.training.gradient_clipping.fsdp_gradient_clipper import GradientClippingMode @@ -24,6 +24,7 @@ class FSDPGradientClipperConfig(BaseModel): max_norm: Annotated[float, Field(strict=True, gt=0)] norm_type: GradientClippingMode wrapped_model: PydanticPytorchModuleType + device_mesh: PydanticDeviceMeshIFType | None = None class FSDPDummyGradientClipperConfig(BaseModel): @@ -41,6 +42,7 @@ class FSDPDummyGradientClipperConfig(BaseModel): wrapped_model: PydanticPytorchModuleType norm_type: GradientClippingMode + device_mesh: PydanticDeviceMeshIFType | None = None class DummyGradientClipperConfig(BaseModel): From cbddcbc8089fac92f38630c5399612c7d5a2d185 Mon Sep 17 00:00:00 2001 From: rrutmann Date: Mon, 15 Sep 2025 10:39:05 +0200 Subject: [PATCH 30/67] test: Add new parameter num_data_parallel_ranks to Trainer --- tests/conftest.py | 1 + 1 
file changed, 1 insertion(+) diff --git a/tests/conftest.py b/tests/conftest.py index bc92e004b..9bf289122 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -196,6 +196,7 @@ def trainer(progress_publisher_mock, gradient_clipper_mock): global_num_seen_tokens=0, num_target_tokens=100, num_target_steps=10, + num_data_parallel_ranks=int(os.getenv("WORLD_SIZE")), ) From 56a917aca238471aa2057ab50b36b92b1faf9dc7 Mon Sep 17 00:00:00 2001 From: rrutmann Date: Mon, 15 Sep 2025 11:06:40 +0200 Subject: [PATCH 31/67] fix: Make FSDP1GradientClipperConfig independent of device_mesh --- src/modalities/registry/components.py | 14 ++++--- .../fsdp_gradient_clipper_config.py | 38 ++++++++++++++++++- 2 files changed, 44 insertions(+), 8 deletions(-) diff --git a/src/modalities/registry/components.py b/src/modalities/registry/components.py index 167a29894..9a3a3c46a 100644 --- a/src/modalities/registry/components.py +++ b/src/modalities/registry/components.py @@ -112,8 +112,10 @@ ) from modalities.training.gradient_clipping.fsdp_gradient_clipper_config import ( DummyGradientClipperConfig, - FSDPDummyGradientClipperConfig, - FSDPGradientClipperConfig, + FSDP1DummyGradientClipperConfig, + FSDP1GradientClipperConfig, + FSDP2DummyGradientClipperConfig, + FSDP2GradientClipperConfig, ) from modalities.utils.mfu import GPT2MFUCalculator from modalities.utils.number_conversion import ( @@ -325,13 +327,13 @@ class ComponentEntity: ComponentEntity("layer_norm", "rms_norm", RMSLayerNorm, RMSLayerNormConfig), ComponentEntity("layer_norm", "layer_norm", nn.LayerNorm, LayerNormConfig), # gradient clippers - ComponentEntity("gradient_clipper", "fsdp1", FSDP1GradientClipper, FSDPGradientClipperConfig), + ComponentEntity("gradient_clipper", "fsdp1", FSDP1GradientClipper, FSDP1GradientClipperConfig), ComponentEntity( - "gradient_clipper", "fsdp1_logging_only", FSDP1LoggingOnlyGradientClipper, FSDPDummyGradientClipperConfig + "gradient_clipper", "fsdp1_logging_only", 
FSDP1LoggingOnlyGradientClipper, FSDP1DummyGradientClipperConfig ), - ComponentEntity("gradient_clipper", "fsdp2", FSDP2GradientClipper, FSDPGradientClipperConfig), + ComponentEntity("gradient_clipper", "fsdp2", FSDP2GradientClipper, FSDP2GradientClipperConfig), ComponentEntity( - "gradient_clipper", "fsdp2_logging_only", FSDP2LoggingOnlyGradientClipper, FSDPDummyGradientClipperConfig + "gradient_clipper", "fsdp2_logging_only", FSDP2LoggingOnlyGradientClipper, FSDP2DummyGradientClipperConfig ), ComponentEntity("gradient_clipper", "dummy", DummyGradientClipper, DummyGradientClipperConfig), # MFU calculators diff --git a/src/modalities/training/gradient_clipping/fsdp_gradient_clipper_config.py b/src/modalities/training/gradient_clipping/fsdp_gradient_clipper_config.py index 500d954d8..80ebee2a8 100644 --- a/src/modalities/training/gradient_clipping/fsdp_gradient_clipper_config.py +++ b/src/modalities/training/gradient_clipping/fsdp_gradient_clipper_config.py @@ -6,7 +6,7 @@ from modalities.training.gradient_clipping.fsdp_gradient_clipper import GradientClippingMode -class FSDPGradientClipperConfig(BaseModel): +class FSDP1GradientClipperConfig(BaseModel): """ Configuration class for FSDP gradient clipper. @@ -24,10 +24,44 @@ class FSDPGradientClipperConfig(BaseModel): max_norm: Annotated[float, Field(strict=True, gt=0)] norm_type: GradientClippingMode wrapped_model: PydanticPytorchModuleType + + +class FSDP2GradientClipperConfig(FSDP1GradientClipperConfig): + """ + Configuration class for FSDP gradient clipper. + + Args: + max_norm (float): The maximum norm value for gradient clipping. + norm_type (GradientClippingMode): The type of gradient clipping to be applied. + wrapped_model (PydanticPytorchModuleType): The wrapped PyTorch model. + + Attributes: + max_norm (float): The maximum norm value for gradient clipping. + norm_type (GradientClippingMode): The type of gradient clipping to be applied. + wrapped_model (PydanticPytorchModuleType): The wrapped PyTorch model. 
+ """ + device_mesh: PydanticDeviceMeshIFType | None = None -class FSDPDummyGradientClipperConfig(BaseModel): +class FSDP1DummyGradientClipperConfig(BaseModel): + """ + Configuration class for FSDP dummy gradient clipper. + + Args: + wrapped_model (PydanticPytorchModuleType): The wrapped PyTorch model. + norm_type (GradientClippingMode): The type of gradient clipping to be applied. + + Attributes: + wrapped_model (PydanticPytorchModuleType): The wrapped PyTorch model. + norm_type (GradientClippingMode): The type of gradient clipping to be applied. + """ + + wrapped_model: PydanticPytorchModuleType + norm_type: GradientClippingMode + + +class FSDP2DummyGradientClipperConfig(FSDP1DummyGradientClipperConfig): """ Configuration class for FSDP dummy gradient clipper. From eb47aa965f99a5cc02dafdb2ec7da6b879a9086d Mon Sep 17 00:00:00 2001 From: rrutmann Date: Mon, 15 Sep 2025 11:07:13 +0200 Subject: [PATCH 32/67] fix: Handle optional device_mesh correctly --- src/modalities/config/instantiation_models.py | 2 +- src/modalities/main.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/modalities/config/instantiation_models.py b/src/modalities/config/instantiation_models.py index 6e3b12d8a..c4bb105e1 100644 --- a/src/modalities/config/instantiation_models.py +++ b/src/modalities/config/instantiation_models.py @@ -181,7 +181,7 @@ def _check_last_step_checkpointed(self) -> "TrainingComponentsInstantiationModel gradient_clipper: PydanticGradientClipperIFType mfu_calculator: Optional[PydanticMFUCalculatorABCType] = None scheduled_pipeline: Optional[PydanticPipelineType] = None - device_mesh: PydanticDeviceMeshIFType + device_mesh: Optional[PydanticDeviceMeshIFType] = None model_raw: PydanticPytorchModuleType @model_validator(mode="after") diff --git a/src/modalities/main.py b/src/modalities/main.py index f64ea16bf..9836a4d8e 100644 --- a/src/modalities/main.py +++ b/src/modalities/main.py @@ -117,7 +117,10 @@ def run(self, components: 
TrainingComponentsInstantiationModel): * components.settings.step_profile.gradient_accumulation_steps * components.settings.cuda_env.world_size ) - num_data_parallel_ranks = get_num_data_parallel_ranks(components.device_mesh) + if components.device_mesh is None: + num_data_parallel_ranks = 1 + else: + num_data_parallel_ranks = get_num_data_parallel_ranks(components.device_mesh) trainer = Trainer( global_rank=components.settings.cuda_env.global_rank, progress_publisher=progress_publisher, From d228351f7e400f7966adbc33ddc1c9d581a497d8 Mon Sep 17 00:00:00 2001 From: rrutmann Date: Wed, 17 Sep 2025 10:36:18 +0200 Subject: [PATCH 33/67] feat: Consider pipeline parallelism in tensor parallelization --- src/modalities/models/model_factory.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/src/modalities/models/model_factory.py b/src/modalities/models/model_factory.py index 877c9cbdc..95c72c338 100644 --- a/src/modalities/models/model_factory.py +++ b/src/modalities/models/model_factory.py @@ -631,7 +631,7 @@ def get_gpt2_tensor_parallelized_model(model: GPT2LLM, device_mesh: DeviceMesh) ), } - if isinstance(model.transformer.wpe, nn.Embedding): + if hasattr(model.transformer, "wpe") and isinstance(model.transformer.wpe, nn.Embedding): # If the position embedding is an nn.Embedding, we can shard it on the sequence dimension # to enable sequence parallelism in the downstream transformer blocks. # Note, for RoPE the wpe layer is an identity operation, which cannnot be sharded. 
@@ -640,11 +640,14 @@ def get_gpt2_tensor_parallelized_model(model: GPT2LLM, device_mesh: DeviceMesh) output_layouts=Shard(0), ) - parallelize_module( - module=model, - device_mesh=tp_mesh, - parallelize_plan=model_tp_plan, - ) + # only keep the relevant parts of the model parallel plan + model_tp_plan = {k: v for k, v in model_tp_plan.items() if hasattr(model.transformer, k.split(".")[1])} + if model_tp_plan: + parallelize_module( + module=model, + device_mesh=tp_mesh, + parallelize_plan=model_tp_plan, + ) transformer_block_tp_plan = { "attention_norm": SequenceParallel(), @@ -703,6 +706,16 @@ def get_gpt2_tensor_parallelized_model(model: GPT2LLM, device_mesh: DeviceMesh) ) transformer_block.attn.n_head_q = transformer_block.attn.n_head_q // tp_mesh.size() transformer_block.attn.n_head_kv = transformer_block.attn.n_head_kv // tp_mesh.size() + # only keep the relevant parts of the model parallel plan + transformer_block_tp_plan = { + k: v + for k, v in transformer_block_tp_plan.items() + if ( + hasattr(transformer_block, k) + or hasattr(transformer_block.attn, k.split(".")[1]) + or hasattr(transformer_block.mlp, k.split(".")[1]) + ) + } parallelize_module( module=transformer_block, device_mesh=tp_mesh, From 55dad72bf973fef5c20d113851165a3b6e4a445a Mon Sep 17 00:00:00 2001 From: rrutmann Date: Wed, 17 Sep 2025 10:58:57 +0200 Subject: [PATCH 34/67] test: Use the same data on each rank & test tensor parallelism --- .../test_pp_fwd_bwd_pass.py | 34 +++++++++++++------ 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py b/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py index 47e1fe990..9014f164e 100644 --- a/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py +++ b/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py @@ -30,12 +30,11 @@ def temp_file_path() -> Path: class ComponentsInstantiationPPModel(BaseModel): - 
initialized_model: PydanticFSDP2ModuleType scheduled_pipeline: PydanticPipelineType class ComponentsInstantiationModel(BaseModel): - initialized_model: PydanticFSDP2ModuleType + fsdp_model: PydanticFSDP2ModuleType loss_fn: PydanticLossIFType @@ -48,7 +47,10 @@ def _get_tmp_sharding_config_path( self, sharding_degree: int, tp_degree: int, pp_degree: int, temp_file_path: Path ) -> Path: working_dir = Path(os.path.dirname(__file__)) - config_file_path = working_dir / "configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml" + if tp_degree > 1: + config_file_path = working_dir / "configs/config_lorem_ipsum_long_fsdp2_pp_tp_fwd_bwd_pass.yaml" + else: + config_file_path = working_dir / "configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml" with open(config_file_path, "r") as file: config_string = file.read() @@ -76,9 +78,9 @@ def _get_components(self, config_file_path: Path, use_pp: bool) -> ComponentsIns @pytest.mark.parametrize( "sharding_degree, tp_degree, pp_degree, world_size", [ - (2, 1, 2, 4), + # (2, 1, 2, 4), # (2, 1, 4, 8), - # (2, 2, 2, 8), # TODO need to support this case + (2, 2, 2, 8), # TODO need to support this case ], ) def test_pp(self, sharding_degree: int, tp_degree: int, pp_degree: int, world_size: int, temp_file_path: Path): @@ -107,11 +109,12 @@ def _test_pp_impl( global_rank=process_id, local_rank=process_id, world_size=world_size, - rdvz_port=22356, + rdvz_port=22359, ): vocab_size = 50304 - sequence_length = 256 + sequence_length = 4 batch_size = 4 + torch.manual_seed(42) sequences = torch.randint(0, vocab_size, (batch_size, sequence_length)) targets = sequences[:, 1:].contiguous() inputs = sequences[:, :-1].contiguous() @@ -127,13 +130,21 @@ def _test_pp_impl( def _forward_step_with_pp( self, pp_model_config_path: Path, inputs: torch.Tensor, targets: torch.Tensor ) -> tuple[bool, torch.Tensor]: - components = self._get_components(pp_model_config_path, use_pp=True) - scheduled_pipeline = components.scheduled_pipeline - loss_pp = 
self._forward_step(scheduled_pipeline, inputs, targets) + try: + components = self._get_components(pp_model_config_path, use_pp=True) + scheduled_pipeline = components.scheduled_pipeline + loss_pp = self._forward_step(scheduled_pipeline, inputs, targets) + except Exception as e: + import traceback + + print(f"Exception in _forward_step_with_pp: {e}") + traceback.print_exc() # <-- Add this line to print the full stack trace + raise e return scheduled_pipeline.is_last_pp_stage, loss_pp def _forward_step(self, scheduled_pipeline: Pipeline, inputs: torch.Tensor, targets: torch.Tensor): """Runs a forward step on the model.""" + os.environ["MODEL_TYPE"] = "PP" pp_schedule = scheduled_pipeline.pp_schedule targets, losses = (targets, []) if scheduled_pipeline.is_last_pp_stage else (None, None) if scheduled_pipeline.is_first_pp_stage: @@ -150,10 +161,11 @@ def _forward_step(self, scheduled_pipeline: Pipeline, inputs: torch.Tensor, targ ) def _forward_step_without_pp(self, inputs: torch.Tensor, targets: torch.Tensor) -> torch.Tensor: + os.environ["MODEL_TYPE"] = "NOPP" working_dir = Path(os.path.dirname(__file__)) fsdp2_model_config_path = working_dir / "configs/config_lorem_ipsum_long_fsdp2_fwd_bwd_pass.yaml" fsdp2_components = self._get_components(fsdp2_model_config_path, use_pp=False) - fsdp2_model = fsdp2_components.initialized_model + fsdp2_model = fsdp2_components.fsdp_model fsdp2_loss_fn = fsdp2_components.loss_fn input_dict = {"input_ids": inputs} From b6a1e2d87968bfeb186027797c7ae0ee2caf906c Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Wed, 17 Sep 2025 19:47:25 +0200 Subject: [PATCH 35/67] refactor(parallelism): Some clean-up. 
--- src/modalities/evaluator.py | 15 ++++++++--- src/modalities/gym.py | 7 +++-- src/modalities/models/gpt2/gpt2_model.py | 4 +-- src/modalities/models/model_factory.py | 13 +++++----- src/modalities/trainer.py | 15 ++++++++--- .../test_pp_fwd_bwd_pass.py | 26 +++++++------------ 6 files changed, 45 insertions(+), 35 deletions(-) diff --git a/src/modalities/evaluator.py b/src/modalities/evaluator.py index 5d56bb90c..3f9f8f343 100644 --- a/src/modalities/evaluator.py +++ b/src/modalities/evaluator.py @@ -9,6 +9,7 @@ from modalities.logging_broker.messages import ExperimentStatus, MessageTypes, ProgressUpdate from modalities.logging_broker.publisher import MessagePublisher from modalities.models.model import model_predict_batch +from modalities.models.parallelism.pipeline_parallelism import Pipeline from modalities.running_env.fsdp.reducer import Reducer from modalities.trainer import ThroughputAggregationKeys from modalities.util import Aggregator, TimeRecorder @@ -36,17 +37,20 @@ def evaluate_batch( batch: DatasetBatch, model: nn.Module, loss_fun: Callable[[InferenceResultBatch], torch.Tensor], - scheduled_pipeline=None, # TODO set type - ) -> torch.Tensor: + scheduled_pipeline: Pipeline | None = None, + ) -> torch.Tensor | None: """Evaluate a single batch by forwarding it through the model and calculating the loss. Args: batch (DatasetBatch): The batch to evaluate model (nn.Module): The model to evaluate loss_fun (Callable[[InferenceResultBatch], torch.Tensor]): The loss function to calculate the loss + scheduled_pipeline (Pipeline | None, optional): In case of pipeline parallelism, this is used to + operate the model. Defaults to None. 
Returns: - torch.Tensor: The loss of the batch + torch.Tensor | None: The loss of the batch + None, if a non-last stage was processed in pipeline parallelism """ with torch.no_grad(): if scheduled_pipeline is not None: @@ -77,7 +81,7 @@ def evaluate( data_loaders: list[LLMDataLoader], loss_fun: Callable[[InferenceResultBatch], torch.Tensor], num_train_steps_done: int, - scheduled_pipeline=None, # TODO set type + scheduled_pipeline: Pipeline | None = None, ) -> dict[str, EvaluationResultBatch]: """Evaluate the model on a set of datasets. @@ -86,6 +90,8 @@ def evaluate( data_loaders (list[LLMDataLoader]): List of dataloaders to evaluate the model on loss_fun (Callable[[InferenceResultBatch], torch.Tensor]): The loss function to calculate the loss num_train_steps_done (int): The number of training steps done so far for logging purposes + scheduled_pipeline (Pipeline | None, optional): In case of pipeline parallelism, this is used to + operate the model. Defaults to None. Returns: dict[str, EvaluationResultBatch]: A dictionary containing the evaluation results for each dataloader @@ -113,6 +119,7 @@ def evaluate( scheduled_pipeline=scheduled_pipeline, ) + # The batch_loss might be None if we use pipeline parallelism and are not the last stage. 
if batch_loss is not None: cumulated_loss[0] += batch_loss.item() # sum up batch loss cumulated_loss[1] += 1 diff --git a/src/modalities/gym.py b/src/modalities/gym.py index 30f7a820f..7ea5e660f 100644 --- a/src/modalities/gym.py +++ b/src/modalities/gym.py @@ -9,6 +9,7 @@ from modalities.dataloader.dataloader import LLMDataLoader from modalities.evaluator import Evaluator from modalities.loss_functions import Loss +from modalities.models.parallelism.pipeline_parallelism import Pipeline from modalities.trainer import Trainer from modalities.training.training_progress import TrainingProgress from modalities.util import print_rank_0 @@ -40,7 +41,7 @@ def run( train_data_loader: LLMDataLoader, evaluation_data_loaders: list[LLMDataLoader], checkpoint_saving: CheckpointSaving, - scheduled_pipeline=None, # TODO set type + scheduled_pipeline: Pipeline | None = None, ): """Runs the model training, including evaluation and checkpointing. @@ -52,6 +53,8 @@ def run( train_data_loader (LLMDataLoader): Data loader with the training data. evaluation_data_loaders (list[LLMDataLoader]): List of data loaders with the evaluation data. checkpoint_saving (CheckpointSaving): Routine for saving checkpoints. + scheduled_pipeline (Pipeline | None, optional): In case of pipeline parallelism, this is used to + operate the model. Defaults to None. 
""" evaluation_callback: Callable[[int], None] = partial( self._run_evaluation, @@ -104,7 +107,7 @@ def _run_evaluation( num_train_steps_done: int, evaluation_data_loaders: list[LLMDataLoader], evaluation_interval_in_steps: int, - scheduled_pipeline=None, # TODO set type + scheduled_pipeline: Pipeline | None = None, ): if num_train_steps_done > 0 and num_train_steps_done % evaluation_interval_in_steps == 0: self.evaluator.evaluate( diff --git a/src/modalities/models/gpt2/gpt2_model.py b/src/modalities/models/gpt2/gpt2_model.py index 1de776101..168c6ae26 100644 --- a/src/modalities/models/gpt2/gpt2_model.py +++ b/src/modalities/models/gpt2/gpt2_model.py @@ -781,7 +781,7 @@ def __init__( ffn_norm_config: LayerNormWrapperConfig, lm_head_norm_config: LayerNormWrapperConfig, use_weight_tying: bool, - seed: int = None, + seed: int | None = None, ): """ Initializes the GPT2LLM object. @@ -805,8 +805,8 @@ def __init__( attention_norm_config (LayerNormWrapperConfig): Config for the attention normalization module. ffn_norm_config (LayerNormWrapperConfig): Config for the feed-forward network normalization module. lm_head_norm_config (LayerNormWrapperConfig): Config for the language model head normalization module. - seed (int, optional): The random seed. Defaults to None. use_weight_tying (bool): Whether to use weight tying. + seed (int, optional): The random seed. Defaults to None. 
""" weight_decay_groups = { "linear": [".attn", ".mlp", ".lm_head.weight"], diff --git a/src/modalities/models/model_factory.py b/src/modalities/models/model_factory.py index 95c72c338..d463161f2 100644 --- a/src/modalities/models/model_factory.py +++ b/src/modalities/models/model_factory.py @@ -567,7 +567,7 @@ def get_gpt2_model( lm_head_norm_config: LayerNormWrapperConfig, use_weight_tying: bool, use_meta_device: Optional[bool] = False, - seed: int = None, + seed: int | None = None, ) -> GPT2LLM: config = dict( sample_key=sample_key, @@ -716,10 +716,11 @@ def get_gpt2_tensor_parallelized_model(model: GPT2LLM, device_mesh: DeviceMesh) or hasattr(transformer_block.mlp, k.split(".")[1]) ) } - parallelize_module( - module=transformer_block, - device_mesh=tp_mesh, - parallelize_plan=transformer_block_tp_plan, - ) + if transformer_block_tp_plan: + parallelize_module( + module=transformer_block, + device_mesh=tp_mesh, + parallelize_plan=transformer_block_tp_plan, + ) return model diff --git a/src/modalities/trainer.py b/src/modalities/trainer.py index 9960920ba..c60dd8542 100644 --- a/src/modalities/trainer.py +++ b/src/modalities/trainer.py @@ -14,6 +14,7 @@ from modalities.logging_broker.publisher import MessagePublisher from modalities.loss_functions import Loss from modalities.models.model import model_predict_batch +from modalities.models.parallelism.pipeline_parallelism import Pipeline from modalities.running_env.fsdp.reducer import Reducer from modalities.training.gradient_clipping.gradient_clipper import GradientClipperIF from modalities.training.training_progress import TrainingProgress @@ -97,8 +98,8 @@ def _train_batch( scheduler: LRScheduler, loss_fun: Loss, micro_batch_id: int, - scheduled_pipeline=None, # TODO set type - ) -> tuple[bool, int, torch.Tensor, Optional[torch.Tensor]]: + scheduled_pipeline: Optional[Pipeline] = None, + ) -> tuple[bool, int, Optional[torch.Tensor], Optional[torch.Tensor]]: """ Conducts a training step on batch of data. 
@@ -109,13 +110,16 @@ def _train_batch( scheduler (LRScheduler): The learning rate scheduler. loss_fun (Loss): The loss function used for training. micro_batch_id (int): The ID of the micro batch. + scheduled_pipeline (Optional[Pipeline], optional): In case of pipeline parallelism, this is used to + operate the model. Defaults to None. Returns: tuple[bool, int, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple containing the following: - step_performed (bool): Indicates whether a training step was performed. - num_train_steps_done (int): The number of training steps done. - - loss (torch.Tensor): The computed loss. + - loss (Optional[torch.Tensor]): The computed loss. + None, if a non-last stage was processed in pipeline parallelism. - gradient_norm_score (Optional[torch.Tensor]): The gradient norm score, if a training step was performed otherwise return None. """ @@ -164,7 +168,7 @@ def train( training_log_interval_in_steps: int, evaluation_callback: Callable[[TrainingProgress], None], checkpointing_callback: Callable[[TrainingProgress], None], - scheduled_pipeline=None, # TODO set type + scheduled_pipeline: Pipeline | None = None, ): """ Trains the model. @@ -176,6 +180,8 @@ def train( training_log_interval_in_steps (int): The interval at which training progress is logged. evaluation_callback (Callable[[TrainingProgress], None]): A callback function for evaluation. checkpointing_callback (Callable[[TrainingProgress], None]): A callback function for checkpointing. + scheduled_pipeline (Pipeline | None, optional): In case of pipeline parallelism, this is used to + operate the model. Defaults to None. Returns: None @@ -234,6 +240,7 @@ def train( training_progress.num_seen_steps_current_run = num_train_steps_done training_progress.num_seen_tokens_current_run = self.global_num_tokens_per_train_step * num_train_steps_done + # The batch_loss might be None if we use pipeline parallelism and are not the last stage.
if batch_loss is not None: # Save the batch loss cumulated_losses[0] += batch_loss.item() diff --git a/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py b/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py index 47e1fe990..73eb5863e 100644 --- a/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py +++ b/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py @@ -18,15 +18,8 @@ @pytest.fixture def temp_file_path() -> Path: - # Create a NamedTemporaryFile that persists after closing (delete=False) - with tempfile.NamedTemporaryFile(delete=False) as tf: - file_path = tf.name - try: - yield Path(file_path) - finally: - # Clean up the file after the test - if os.path.exists(file_path): - os.remove(file_path) + with tempfile.NamedTemporaryFile() as tf: + yield tf.name class ComponentsInstantiationPPModel(BaseModel): @@ -63,14 +56,14 @@ def _get_tmp_sharding_config_path( return temp_file_path - def _get_components(self, config_file_path: Path, use_pp: bool) -> ComponentsInstantiationPPModel: + def _get_components( + self, config_file_path: Path, use_pp: bool + ) -> ComponentsInstantiationPPModel | ComponentsInstantiationModel: torch.manual_seed(42) main_obj = Main(config_file_path) - if use_pp: - components_model_type = ComponentsInstantiationPPModel - else: - components_model_type = ComponentsInstantiationModel - components: components_model_type = main_obj.build_components(components_model_type=components_model_type) + components_model_type = ComponentsInstantiationPPModel if use_pp else ComponentsInstantiationModel + components = main_obj.build_components(components_model_type=components_model_type) + assert isinstance(components, components_model_type) return components @pytest.mark.parametrize( @@ -141,8 +134,7 @@ def _forward_step(self, scheduled_pipeline: Pipeline, inputs: torch.Tensor, targ else: pp_schedule.step(target=targets, losses=losses) - # accumulate losses across pipeline 
microbatches - # TODO: PP+FSDP unexpectedly puts the loss back to the CPU + # accumulate losses across pipeline microbatches return ( torch.mean(torch.stack(losses)).to(losses[0].device) if scheduled_pipeline.is_last_pp_stage From c49895a987467475a90033c98d6b97ad256ff27d Mon Sep 17 00:00:00 2001 From: rrutmann Date: Fri, 19 Sep 2025 11:34:45 +0200 Subject: [PATCH 36/67] test: Update configs for parallelization testing --- ...g_lorem_ipsum_long_fsdp2_fwd_bwd_pass.yaml | 34 ++-- ...orem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml | 41 ++-- ...m_ipsum_long_fsdp2_pp_tp_fwd_bwd_pass.yaml | 177 ++++++++++++++++++ 3 files changed, 212 insertions(+), 40 deletions(-) create mode 100644 tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_tp_fwd_bwd_pass.yaml diff --git a/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_fwd_bwd_pass.yaml b/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_fwd_bwd_pass.yaml index 6603b1850..bdf991173 100644 --- a/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_fwd_bwd_pass.yaml +++ b/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_fwd_bwd_pass.yaml @@ -29,13 +29,28 @@ device_mesh: data_parallel_replicate_degree: 1 data_parallel_shard_degree: -1 world_size: ${settings.cuda_env.world_size} + +fsdp_model: + component_key: model + variant_key: fsdp2_wrapped + config: + model: + instance_key: initialized_model + pass_type: BY_REFERENCE + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + mixed_precision_settings: + param_dtype: BF_16 + reduce_dtype: BF_16 + block_names: [GPT2Block] initialized_model: component_key: model variant_key: model_initialized config: model: - instance_key: fsdp_model + instance_key: model_raw pass_type: BY_REFERENCE model_initializer: component_key: model_initialization @@ -44,23 +59,8 @@ initialized_model: model_type:
gpt2 weight_init_type: scaled mean: 0.0 - std: 0.02 + std: 1.02 num_layers: ${model_raw.config.n_layer} - -fsdp_model: - component_key: model - variant_key: fsdp2_wrapped - config: - model: - instance_key: model_raw - pass_type: BY_REFERENCE - device_mesh: - instance_key: device_mesh - pass_type: BY_REFERENCE - mixed_precision_settings: - param_dtype: BF_16 - reduce_dtype: BF_16 - block_names: [GPT2Block] model_raw: component_key: model diff --git a/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml b/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml index 0ceb02a53..2ffbe4cc5 100644 --- a/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml +++ b/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml @@ -31,28 +31,6 @@ device_mesh: data_parallel_shard_degree: -1 world_size: ${settings.cuda_env.world_size} -initialized_model: - component_key: model - variant_key: model_initialized - config: - model: - component_key: pipeline - variant_key: selector - config: - pipeline: - instance_key: scheduled_pipeline - pass_type: BY_REFERENCE - selection_type: MODEL_PART - model_initializer: - component_key: model_initialization - variant_key: composed - config: - model_type: gpt2 - weight_init_type: scaled - mean: 0.0 - std: 0.02 - num_layers: ${model_raw.config.n_layer} - scheduled_pipeline: component_key: pipeline variant_key: scheduled @@ -109,7 +87,7 @@ staged_pipeline: variant_key: staged config: whole_model: - instance_key: model_raw + instance_key: initialized_model pass_type: BY_REFERENCE stages_generator: component_key: stages_generator @@ -125,6 +103,23 @@ staged_pipeline: pp_schedule_name: gpipe num_layers_per_stage: 4 +initialized_model: + component_key: model + variant_key: model_initialized + config: + model: + instance_key: model_raw + 
pass_type: BY_REFERENCE + model_initializer: + component_key: model_initialization + variant_key: composed + config: + model_type: gpt2 + weight_init_type: scaled + mean: 0.0 + std: 1.02 + num_layers: ${model_raw.config.n_layer} + model_raw: component_key: model variant_key: gpt2 diff --git a/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_tp_fwd_bwd_pass.yaml b/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_tp_fwd_bwd_pass.yaml new file mode 100644 index 000000000..fb8ee5f7d --- /dev/null +++ b/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_tp_fwd_bwd_pass.yaml @@ -0,0 +1,177 @@ +settings: + experiment_id: ${modalities_env:experiment_id} + config_file_path: ${modalities_env:config_file_path} + referencing_keys: + sample_key: input_ids + target_key: target_ids + prediction_key: logits + cuda_env: + local_rank: ${cuda_env:LOCAL_RANK} + global_rank: ${cuda_env:RANK} + world_size: ${cuda_env:WORLD_SIZE} + step_profile: + gradient_accumulation_steps: 1 + local_train_micro_batch_size: 4 + sequence_length: 256 + +loss_fn: + component_key: loss + variant_key: clm_cross_entropy_loss + config: + target_key: ${settings.referencing_keys.target_key} + prediction_key: ${settings.referencing_keys.prediction_key} + +device_mesh: + component_key: device_mesh + variant_key: default + config: + device_type: cuda + data_parallel_replicate_degree: 1 + pipeline_parallel_degree: 2 + data_parallel_shard_degree: -1 + world_size: ${settings.cuda_env.world_size} + +scheduled_pipeline: + component_key: pipeline + variant_key: scheduled + config: + loss_fn: + instance_key: loss_fn + pass_type: BY_REFERENCE + pp_schedule_name: gpipe + batch_size: ${settings.step_profile.local_train_micro_batch_size} + microbatch_size: 2 + pp_degree: ${device_mesh.config.pipeline_parallel_degree} + pipeline: + component_key: pipeline + variant_key: builder + config: + pp_stage: + 
component_key: pipeline + variant_key: selector + config: + pipeline: + instance_key: staged_pipeline + pass_type: BY_REFERENCE + selection_type: PP_STAGE + model_part: + instance_key: fsdp_model + pass_type: BY_REFERENCE + +fsdp_model: + component_key: model + variant_key: fsdp2_wrapped + config: + model: + instance_key: gpt2_tp_model + pass_type: BY_REFERENCE + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + mixed_precision_settings: + param_dtype: BF_16 + reduce_dtype: BF_16 + block_names: [GPT2Block] + +gpt2_tp_model: + component_key: model + variant_key: gpt2_tp + config: + model: + instance_key: model_part + pass_type: BY_REFERENCE + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + +model_part: + component_key: pipeline + variant_key: selector + config: + pipeline: + instance_key: staged_pipeline + pass_type: BY_REFERENCE + selection_type: MODEL_PART + +staged_pipeline: + component_key: pipeline + variant_key: staged + config: + whole_model: + instance_key: initialized_model + pass_type: BY_REFERENCE + stages_generator: + component_key: stages_generator + variant_key: gpt2_stages_generator + config: + num_model_layers: ${model_raw.config.n_layer} + input_layer_equivalence: 1 + output_layer_equivalence: 1 + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + local_rank: ${settings.cuda_env.local_rank} + pp_schedule_name: gpipe + num_layers_per_stage: 4 + +initialized_model: + component_key: model + variant_key: model_initialized + config: + model: + instance_key: model_raw + pass_type: BY_REFERENCE + model_initializer: + component_key: model_initialization + variant_key: composed + config: + model_type: gpt2 + weight_init_type: scaled + mean: 0.0 + std: 0.02 + num_layers: ${model_raw.config.n_layer} + +model_raw: + component_key: model + variant_key: gpt2 + config: + use_meta_device: true + use_weight_tying: false + sample_key: ${settings.referencing_keys.sample_key} + poe_type: NOPE + sequence_length: 
${settings.step_profile.sequence_length} + prediction_key: ${loss_fn.config.prediction_key} + vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency + n_layer: 6 + n_head_q: 8 + n_head_kv: 4 + ffn_hidden: 128 + n_embd: 128 + dropout: 0.0 + bias: true # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster + attention_config: + qkv_transforms: + - type_hint: RotaryTransform + config: + n_embd: ${model_raw.config.n_embd} + n_head: ${model_raw.config.n_head_q} #it has to be head_q here + seq_length_dim: -2 + base_freq: 10000 + attention_implementation: manual + activation_type: swiglu + attention_norm_config: + norm_type: layer_norm + config: + normalized_shape: ${model_raw.config.n_embd} + eps: 1e-5 + ffn_norm_config: + norm_type: layer_norm + config: + normalized_shape: ${model_raw.config.n_embd} + eps: 1e-5 + lm_head_norm_config: + norm_type: layer_norm + config: + normalized_shape: ${model_raw.config.n_embd} + eps: 1e-5 + From f685fc5d88b9a524dd7a18609ec6341946487f67 Mon Sep 17 00:00:00 2001 From: rrutmann Date: Fri, 19 Sep 2025 12:43:22 +0200 Subject: [PATCH 37/67] test: Use correct length to create test sequences --- .../pipeline_parallelism/test_pp_fwd_bwd_pass.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py b/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py index 8906c5f86..dabd731b9 100644 --- a/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py +++ b/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py @@ -71,9 +71,9 @@ def _get_components( @pytest.mark.parametrize( "sharding_degree, tp_degree, pp_degree, world_size", [ - # (2, 1, 2, 4), - # (2, 1, 4, 8), - (2, 2, 2, 8), # TODO need to support this case + (2, 1, 2, 4), + (2, 1, 4, 8), + (2, 2, 2, 8), ], ) def test_pp(self, sharding_degree: int, tp_degree: int, pp_degree: 
int, world_size: int, temp_file_path: Path): @@ -105,10 +105,10 @@ def _test_pp_impl( rdvz_port=22359, ): vocab_size = 50304 - sequence_length = 4 + sequence_length = 256 batch_size = 4 torch.manual_seed(42) - sequences = torch.randint(0, vocab_size, (batch_size, sequence_length)) + sequences = torch.randint(0, vocab_size, (batch_size, sequence_length + 1)) targets = sequences[:, 1:].contiguous() inputs = sequences[:, :-1].contiguous() From c07fcf6b2a077aa5ddc5d13d7cc5fedd39486fd2 Mon Sep 17 00:00:00 2001 From: rrutmann Date: Fri, 19 Sep 2025 12:45:10 +0200 Subject: [PATCH 38/67] test: Use realistic std for model initialization --- .../configs/config_lorem_ipsum_long_fsdp2_fwd_bwd_pass.yaml | 2 +- .../configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_fwd_bwd_pass.yaml b/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_fwd_bwd_pass.yaml index bdf991173..988e70eba 100644 --- a/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_fwd_bwd_pass.yaml +++ b/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_fwd_bwd_pass.yaml @@ -59,7 +59,7 @@ initialized_model: model_type: gpt2 weight_init_type: scaled mean: 0.0 - std: 1.02 + std: 0.02 num_layers: ${model_raw.config.n_layer} model_raw: diff --git a/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml b/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml index 2ffbe4cc5..f41e912bc 100644 --- a/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml +++ b/tests/fsdp2_parallelization/pipeline_parallelism/configs/config_lorem_ipsum_long_fsdp2_pp_fwd_bwd_pass.yaml @@ -117,7 +117,7 @@ 
initialized_model: model_type: gpt2 weight_init_type: scaled mean: 0.0 - std: 1.02 + std: 0.02 num_layers: ${model_raw.config.n_layer} model_raw: From 5019bbb869621b09d492cb1deeca1ed6473662b5 Mon Sep 17 00:00:00 2001 From: rrutmann Date: Fri, 19 Sep 2025 12:45:53 +0200 Subject: [PATCH 39/67] fix: Remove unused third dimension for reduced_losses --- src/modalities/trainer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/modalities/trainer.py b/src/modalities/trainer.py index c60dd8542..536f98409 100644 --- a/src/modalities/trainer.py +++ b/src/modalities/trainer.py @@ -282,9 +282,7 @@ def train( operation=dist.ReduceOp.SUM, # 1.) summed batch loss / (num batches * world size) # 2.) last batch loss / world size - post_processing_fun=lambda t: torch.stack( - [t[0] / t[-1], t[1] / self.num_data_parallel_ranks, t[-1]] - ), + post_processing_fun=lambda t: torch.stack([t[0] / t[-1], t[1] / self.num_data_parallel_ranks]), ) train_loss_avg, train_loss_last_batch = ( From a08e555af1db303efc45c0f33144005b39e64e64 Mon Sep 17 00:00:00 2001 From: rrutmann Date: Fri, 19 Sep 2025 12:46:36 +0200 Subject: [PATCH 40/67] refactor: Remove unused filtering --- src/modalities/models/model_factory.py | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/src/modalities/models/model_factory.py b/src/modalities/models/model_factory.py index d463161f2..a96a9bc2d 100644 --- a/src/modalities/models/model_factory.py +++ b/src/modalities/models/model_factory.py @@ -706,21 +706,10 @@ def get_gpt2_tensor_parallelized_model(model: GPT2LLM, device_mesh: DeviceMesh) ) transformer_block.attn.n_head_q = transformer_block.attn.n_head_q // tp_mesh.size() transformer_block.attn.n_head_kv = transformer_block.attn.n_head_kv // tp_mesh.size() - # only keep the relevant parts of the model parallel plan - transformer_block_tp_plan = { - k: v - for k, v in transformer_block_tp_plan.items() - if ( - hasattr(transformer_block, k) - or 
hasattr(transformer_block.attn, k.split(".")[1]) - or hasattr(transformer_block.mlp, k.split(".")[1]) - ) - } - if transformer_block_tp_plan: - parallelize_module( - module=transformer_block, - device_mesh=tp_mesh, - parallelize_plan=transformer_block_tp_plan, - ) + parallelize_module( + module=transformer_block, + device_mesh=tp_mesh, + parallelize_plan=transformer_block_tp_plan, + ) return model From 45b54188a97c0fd7c0ffcca84e41c0176cec51a0 Mon Sep 17 00:00:00 2001 From: rrutmann Date: Mon, 22 Sep 2025 14:44:39 +0200 Subject: [PATCH 41/67] fix: Aggregate loss of last train batch correct across pp ranks --- src/modalities/main.py | 8 ++++---- .../running_env/fsdp/device_mesh.py | 19 +++++-------------- src/modalities/trainer.py | 10 ++++++---- tests/conftest.py | 2 +- 4 files changed, 16 insertions(+), 23 deletions(-) diff --git a/src/modalities/main.py b/src/modalities/main.py index 9836a4d8e..2f680cf71 100644 --- a/src/modalities/main.py +++ b/src/modalities/main.py @@ -20,7 +20,7 @@ from modalities.logging_broker.subscriber import MessageSubscriberIF from modalities.registry.components import COMPONENTS from modalities.registry.registry import Registry -from modalities.running_env.fsdp.device_mesh import get_num_data_parallel_ranks +from modalities.running_env.fsdp.device_mesh import ParallelismDegrees, get_num_parallel_ranks from modalities.trainer import Trainer from modalities.util import get_synced_experiment_id_of_run, get_total_number_of_trainable_parameters, print_rank_0 @@ -118,9 +118,9 @@ def run(self, components: TrainingComponentsInstantiationModel): * components.settings.cuda_env.world_size ) if components.device_mesh is None: - num_data_parallel_ranks = 1 + num_pipeline_parallel_ranks = 1 else: - num_data_parallel_ranks = get_num_data_parallel_ranks(components.device_mesh) + num_pipeline_parallel_ranks = get_num_parallel_ranks(components.device_mesh, ParallelismDegrees.PP) trainer = Trainer( global_rank=components.settings.cuda_env.global_rank, 
progress_publisher=progress_publisher, @@ -133,7 +133,7 @@ def run(self, components: TrainingComponentsInstantiationModel): gradient_clipper=components.gradient_clipper, global_num_tokens_per_train_step=global_num_tokens_per_train_step, mfu_calculator=components.mfu_calculator, - num_data_parallel_ranks=num_data_parallel_ranks, + num_pipeline_parallel_ranks=num_pipeline_parallel_ranks, ) # Evaluator diff --git a/src/modalities/running_env/fsdp/device_mesh.py b/src/modalities/running_env/fsdp/device_mesh.py index c74751362..4770951f0 100644 --- a/src/modalities/running_env/fsdp/device_mesh.py +++ b/src/modalities/running_env/fsdp/device_mesh.py @@ -129,23 +129,14 @@ def get_device_mesh( return device_mesh -def get_num_data_parallel_ranks(device_mesh: DeviceMesh) -> int: - """Gets the number of data parallel ranks from the device mesh. +def get_num_parallel_ranks(device_mesh: DeviceMesh, parallelism_method: ParallelismDegrees) -> int: + """Gets the number of parallel ranks from the device mesh for a specific parallelism method. Args: device_mesh (DeviceMesh): The device mesh. + parallelism_method (ParallelismDegrees): The parallelism method. Returns: - int: The number of data parallel ranks. + int: The number of parallel ranks for the specified parallelism method. 
""" - world_size = device_mesh.size() - dp_size = world_size - for parallelism_degree in ( - ParallelismDegrees.TP.value, - ParallelismDegrees.PP.value, - ParallelismDegrees.CP.value, - ): - if parallelism_degree in device_mesh.mesh_dim_names: - dp_size //= device_mesh.size(device_mesh.mesh_dim_names.index(parallelism_degree)) - - return dp_size + return device_mesh.size(device_mesh.mesh_dim_names.index(parallelism_method.value)) diff --git a/src/modalities/trainer.py b/src/modalities/trainer.py index 536f98409..979e245dc 100644 --- a/src/modalities/trainer.py +++ b/src/modalities/trainer.py @@ -31,7 +31,7 @@ class Trainer: def __init__( self, global_rank: int, - num_data_parallel_ranks: int, + num_pipeline_parallel_ranks: int, progress_publisher: MessagePublisher[ProgressUpdate], evaluation_result_publisher: MessagePublisher[EvaluationResultBatch], gradient_acc_steps: int, @@ -64,7 +64,7 @@ def __init__( None """ self.global_rank = global_rank - self.num_data_parallel_ranks = num_data_parallel_ranks + self.num_pipeline_parallel_ranks = num_pipeline_parallel_ranks self.progress_publisher = progress_publisher self.evaluation_result_publisher = evaluation_result_publisher self.gradient_acc_steps = gradient_acc_steps @@ -281,8 +281,10 @@ def train( tensor=cumulated_losses, operation=dist.ReduceOp.SUM, # 1.) summed batch loss / (num batches * world size) - # 2.) last batch loss / world size - post_processing_fun=lambda t: torch.stack([t[0] / t[-1], t[1] / self.num_data_parallel_ranks]), + # 2.) 
last batch loss / (world size / num_pipeline_parallel_ranks) + post_processing_fun=lambda t: torch.stack( + [t[0] / t[-1], t[1] / dist.get_world_size() * self.num_pipeline_parallel_ranks] + ), ) train_loss_avg, train_loss_last_batch = ( diff --git a/tests/conftest.py b/tests/conftest.py index 9bf289122..9bcc5f1d6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -196,7 +196,7 @@ def trainer(progress_publisher_mock, gradient_clipper_mock): global_num_seen_tokens=0, num_target_tokens=100, num_target_steps=10, - num_data_parallel_ranks=int(os.getenv("WORLD_SIZE")), + num_pipeline_parallel_ranks=1, ) From a394ab0c6d43568d0f4a27baef8de2632cdc2d3e Mon Sep 17 00:00:00 2001 From: rrutmann Date: Mon, 22 Sep 2025 14:53:16 +0200 Subject: [PATCH 42/67] docs: Add example config for pipeline and tensor parallelism --- .../config_lorem_ipsum_long_fsdp2_pp_tp.yaml | 422 ++++++++++++++++++ 1 file changed, 422 insertions(+) create mode 100644 config_files/training/config_lorem_ipsum_long_fsdp2_pp_tp.yaml diff --git a/config_files/training/config_lorem_ipsum_long_fsdp2_pp_tp.yaml b/config_files/training/config_lorem_ipsum_long_fsdp2_pp_tp.yaml new file mode 100644 index 000000000..f7b4835f6 --- /dev/null +++ b/config_files/training/config_lorem_ipsum_long_fsdp2_pp_tp.yaml @@ -0,0 +1,422 @@ +settings: + experiment_id: ${modalities_env:experiment_id} + config_file_path: ${modalities_env:config_file_path} + referencing_keys: + sample_key: input_ids + target_key: target_ids + prediction_key: logits + cuda_env: + local_rank: ${cuda_env:LOCAL_RANK} + global_rank: ${cuda_env:RANK} + world_size: 8 + paths: + checkpoint_saving_path: data/checkpoints + train_dataset_path: /raid/s3/opengptx/user/richard-rutmann/data/modalities/gpt2_tokenized/000_00000.pbin + test_dataset_path: ./data/lorem_ipsum.pbin + intervals: + training_log_interval_in_steps: 2 + checkpointing_interval_in_steps: 100000 + evaluation_interval_in_steps: 15 + consistency_enforcement: + 
enforce_tokens_per_step_consistency: true + enforce_last_step_logged: false + enforce_last_step_evaluated: false + enforce_last_step_checkpointed: false + step_profile: + gradient_accumulation_steps: 1 + local_train_micro_batch_size: 4 + sequence_length: 16 + training_target: + num_target_tokens: + component_key: number_conversion + variant_key: num_tokens_from_num_steps + config: + sequence_length: ${settings.step_profile.sequence_length} + num_ranks: ${settings.cuda_env.world_size} + local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size} + gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps} + num_steps: ${settings.training_target.num_target_steps} + num_target_steps: 20 + training_progress: + global_num_seen_tokens: 0 + num_seen_steps: 0 + num_seen_samples: 0 + last_step: -1 + +collate_fn: + component_key: collate_fn + variant_key: gpt_2_llm_collator + config: + sample_key: ${settings.referencing_keys.sample_key} + target_key: ${settings.referencing_keys.target_key} + +train_dataset: + component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: ${settings.paths.train_dataset_path} + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + +train_dataloader: + component_key: data_loader + variant_key: default + config: + num_workers: 2 + pin_memory: true + dataloader_tag: train + dataset: + instance_key: train_dataset + pass_type: BY_REFERENCE + batch_sampler: + component_key: batch_sampler + variant_key: default + config: + batch_size: ${settings.step_profile.local_train_micro_batch_size} + drop_last: true + sampler: + component_key: sampler + variant_key: resumable_distributed_multi_dim_sampler + config: + dataset: + instance_key: train_dataset + pass_type: BY_REFERENCE + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + data_parallel_key: dp_shard + shuffle: true + seed: 42 + drop_last: true + 
skip_num_global_samples: ${settings.training_progress.num_seen_samples} + collate_fn: + instance_key: collate_fn + pass_type: BY_REFERENCE + +test_dataset: + component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: ${settings.paths.test_dataset_path} + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + +test_dataloader: + component_key: data_loader + variant_key: default + config: + num_workers: 2 + pin_memory: true + dataloader_tag: test + dataset: + instance_key: test_dataset + pass_type: BY_REFERENCE + batch_sampler: + component_key: batch_sampler + variant_key: default + config: + batch_size: ${settings.step_profile.local_train_micro_batch_size} + drop_last: true + sampler: + component_key: sampler + variant_key: resumable_distributed_multi_dim_sampler + config: + dataset: + instance_key: test_dataset + pass_type: BY_REFERENCE + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + data_parallel_key: dp_shard + shuffle: true + seed: 42 + drop_last: true + collate_fn: + instance_key: collate_fn + pass_type: BY_REFERENCE + +eval_dataloaders: + - instance_key: test_dataloader + pass_type: BY_REFERENCE + +checkpoint_saving: + component_key: checkpoint_saving + variant_key: default + config: + checkpoint_saving_strategy: + component_key: checkpoint_saving_strategy + variant_key: save_k_most_recent_checkpoints_strategy + config: + k: -1 # -1 to save all checkpoints + checkpoint_saving_execution: + component_key: checkpoint_saving_execution + variant_key: dcp + config: + checkpoint_path: ${settings.paths.checkpoint_saving_path} + global_rank: ${settings.cuda_env.global_rank} + experiment_id: ${settings.experiment_id} + +loss_fn: + component_key: loss + variant_key: clm_cross_entropy_loss + config: + target_key: ${settings.referencing_keys.target_key} + prediction_key: ${settings.referencing_keys.prediction_key} + +device_mesh: + component_key: 
device_mesh + variant_key: default + config: + device_type: cuda + data_parallel_replicate_degree: 1 + pipeline_parallel_degree: 2 + tensor_parallel_degree: 2 + data_parallel_shard_degree: -1 + world_size: ${settings.cuda_env.world_size} + +app_state: + component_key: app_state + variant_key: raw + config: + model: + instance_key: initialized_model + pass_type: BY_REFERENCE + optimizer: + instance_key: optimizer + pass_type: BY_REFERENCE + lr_scheduler: + instance_key: lr_scheduler + pass_type: BY_REFERENCE + +initialized_model: + component_key: model + variant_key: model_initialized + config: + model: + component_key: pipeline + variant_key: selector + config: + pipeline: + instance_key: scheduled_pipeline + pass_type: BY_REFERENCE + selection_type: MODEL_PART + model_initializer: + component_key: model_initialization + variant_key: composed + config: + model_type: gpt2 + weight_init_type: scaled + mean: 0.0 + std: 0.02 + num_layers: ${model_raw.config.n_layer} + +scheduled_pipeline: + component_key: pipeline + variant_key: scheduled + config: + loss_fn: + instance_key: loss_fn + pass_type: BY_REFERENCE + pp_schedule_name: gpipe + batch_size: ${settings.step_profile.local_train_micro_batch_size} + microbatch_size: 2 + pp_degree: ${device_mesh.config.pipeline_parallel_degree} + # maybe better to use the fsdp model and the schedule here + # instead of passing in the staged pipeline? + # If fsdp_model creates a copy then this is not in the scope of + # the staged pipeline. 
+ pipeline: + component_key: pipeline + variant_key: builder + config: + pp_stage: + component_key: pipeline + variant_key: selector + config: + pipeline: + instance_key: staged_pipeline + pass_type: BY_REFERENCE + selection_type: PP_STAGE + model_part: + instance_key: fsdp_model + pass_type: BY_REFERENCE + +fsdp_model: + component_key: model + variant_key: fsdp2_wrapped + config: + model: + instance_key: gpt2_tp_model + pass_type: BY_REFERENCE + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + mixed_precision_settings: + param_dtype: BF_16 + reduce_dtype: BF_16 + block_names: [GPT2Block] + +gpt2_tp_model: + component_key: model + variant_key: gpt2_tp + config: + model: + instance_key: model_part + pass_type: BY_REFERENCE + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + +model_part: + component_key: pipeline + variant_key: selector + config: + pipeline: + instance_key: staged_pipeline + pass_type: BY_REFERENCE + selection_type: MODEL_PART + +staged_pipeline: + component_key: pipeline + variant_key: staged + config: + whole_model: + instance_key: model_raw + pass_type: BY_REFERENCE + stages_generator: + component_key: stages_generator + variant_key: gpt2_stages_generator + config: + num_model_layers: ${model_raw.config.n_layer} + input_layer_equivalence: 1 + output_layer_equivalence: 1 + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + local_rank: ${settings.cuda_env.local_rank} + pp_schedule_name: gpipe + num_layers_per_stage: 2 + +model_raw: + component_key: model + variant_key: gpt2 + config: + seed: 42 + use_meta_device: true + use_weight_tying: false + sample_key: ${settings.referencing_keys.sample_key} + poe_type: NOPE + sequence_length: ${settings.step_profile.sequence_length} + prediction_key: ${loss_fn.config.prediction_key} + vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency + n_layer: 2 + n_head_q: 8 + n_head_kv: 4 + ffn_hidden: 128 + n_embd: 128 
+ dropout: 0.0 + bias: true # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster + attention_config: + qkv_transforms: + - type_hint: RotaryTransform + config: + n_embd: ${model_raw.config.n_embd} + n_head: ${model_raw.config.n_head_q} #it has to be head_q here + seq_length_dim: -2 + base_freq: 10000 + attention_implementation: manual + activation_type: swiglu + attention_norm_config: + norm_type: layer_norm + config: + normalized_shape: ${model_raw.config.n_embd} + eps: 1e-5 + ffn_norm_config: + norm_type: layer_norm + config: + normalized_shape: ${model_raw.config.n_embd} + eps: 1e-5 + lm_head_norm_config: + norm_type: layer_norm + config: + normalized_shape: ${model_raw.config.n_embd} + eps: 1e-5 + +lr_scheduler: + component_key: scheduler + variant_key: onecycle_lr + config: + optimizer: + instance_key: optimizer + pass_type: BY_REFERENCE + max_lr: 6e-4 + div_factor: 10 + final_div_factor: 1 + total_steps: ${settings.training_target.num_target_steps} + pct_start: 0.01 + anneal_strategy: cos + last_epoch: ${settings.training_progress.last_step} + +optimizer: + component_key: optimizer + variant_key: adam_w + config: + lr: 0.0001 + betas: [0.9, 0.95] + eps: 1e-8 + weight_decay: 1e-1 + weight_decay_groups_excluded: [embedding, layernorm] + wrapped_model: + instance_key: initialized_model + pass_type: BY_REFERENCE + +gradient_clipper: + component_key: gradient_clipper + variant_key: fsdp2 + config: + wrapped_model: + instance_key: initialized_model + pass_type: BY_REFERENCE + norm_type: P2_NORM + max_norm: 1.0 + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + +progress_subscriber: + component_key: progress_subscriber + variant_key: rich + config: + global_rank: ${settings.cuda_env.global_rank} + num_seen_steps: ${settings.training_progress.num_seen_steps} + num_target_steps: ${settings.training_target.num_target_steps} + train_dataloader_tag: ${train_dataloader.config.dataloader_tag} + eval_dataloaders: + 
instance_key: eval_dataloaders + pass_type: BY_REFERENCE + +evaluation_subscriber: + component_key: results_subscriber + variant_key: wandb + config: + global_rank: ${settings.cuda_env.global_rank} + project: modalities_dcp_tests + mode: OFFLINE + experiment_id: ${settings.experiment_id} + directory: wandb_storage + config_file_path: ${settings.config_file_path} + +# mfu_calculator: +# component_key: mfu_calculator +# variant_key: gpt2 +# config: +# n_layer: ${model_raw.config.n_layer} +# sequence_length: ${settings.step_profile.sequence_length} +# n_embd: ${model_raw.config.n_embd} +# world_size: ${settings.cuda_env.world_size} +# raw_model: +# instance_key: model_raw +# pass_type: BY_REFERENCE +# wrapped_model: +# instance_key: initialized_model +# pass_type: BY_REFERENCE \ No newline at end of file From cae050ec1d37d8d1ddbb70b39cfd6880487fa084 Mon Sep 17 00:00:00 2001 From: rrutmann Date: Mon, 22 Sep 2025 15:06:51 +0200 Subject: [PATCH 43/67] docs: Add docstrings and type hints --- src/modalities/models/gpt2/gpt2_model.py | 11 ++++++++++- src/modalities/models/model_factory.py | 9 +++++++++ src/modalities/running_env/fsdp/device_mesh.py | 3 ++- src/modalities/trainer.py | 18 ++++++++++++++++++ 4 files changed, 39 insertions(+), 2 deletions(-) diff --git a/src/modalities/models/gpt2/gpt2_model.py b/src/modalities/models/gpt2/gpt2_model.py index 168c6ae26..a1ae9fd2e 100644 --- a/src/modalities/models/gpt2/gpt2_model.py +++ b/src/modalities/models/gpt2/gpt2_model.py @@ -806,7 +806,7 @@ def __init__( ffn_norm_config (LayerNormWrapperConfig): Config for the feed-forward network normalization module. lm_head_norm_config (LayerNormWrapperConfig): Config for the language model head normalization module. use_weight_tying (bool): Whether to use weight tying. - seed (int, optional): The random seed. Defaults to None. + seed (Optional[int]): The random seed. Defaults to None. 
""" weight_decay_groups = { "linear": [".attn", ".mlp", ".lm_head.weight"], @@ -910,6 +910,15 @@ def forward(self, inputs: torch.Tensor) -> torch.Tensor: ... def forward(self, inputs: dict[str, torch.Tensor] | torch.Tensor) -> dict[str, torch.Tensor] | torch.Tensor: + """ + Forward pass of the GPT2LLM module. + + Args: + inputs (dict[str, torch.Tensor] | torch.Tensor): Input data. + + Returns: + dict[str, torch.Tensor] | torch.Tensor: Model output. + """ if isinstance(inputs, dict): return {self.prediction_key: self.forward_impl(inputs[self.sample_key])} else: diff --git a/src/modalities/models/model_factory.py b/src/modalities/models/model_factory.py index a96a9bc2d..d889c213a 100644 --- a/src/modalities/models/model_factory.py +++ b/src/modalities/models/model_factory.py @@ -58,6 +58,15 @@ class ModelFactory: @staticmethod def _is_model_on_meta_device(model: nn.Module) -> bool: + """ + Checks if all parameters and buffers of the model are on the meta device. + + Args: + model (nn.Module): The model to check. + + Returns: + bool: True if all parameters and buffers are on meta device, False otherwise. + """ meta_counter = 0 param_counter = 0 for _, tensor in itertools.chain(model.named_parameters(), model.named_buffers()): diff --git a/src/modalities/running_env/fsdp/device_mesh.py b/src/modalities/running_env/fsdp/device_mesh.py index 4770951f0..1e70c5323 100644 --- a/src/modalities/running_env/fsdp/device_mesh.py +++ b/src/modalities/running_env/fsdp/device_mesh.py @@ -84,7 +84,8 @@ def get_device_mesh( enable_loss_parallel: bool, world_size: int, ) -> DeviceMesh: - """Gets the device mesh for the specified parallelism degrees. + """ + Gets the device mesh for the specified parallelism degrees. Args: device_type (str): The device type. 
diff --git a/src/modalities/trainer.py b/src/modalities/trainer.py index 979e245dc..4b79dbedc 100644 --- a/src/modalities/trainer.py +++ b/src/modalities/trainer.py @@ -28,6 +28,24 @@ class ThroughputAggregationKeys(Enum): class Trainer: + """ + Trainer class for model training. + + Args: + global_rank (int): The global rank. + num_pipeline_parallel_ranks (int): Number of pipeline parallel ranks. + progress_publisher (MessagePublisher[ProgressUpdate]): Progress publisher. + evaluation_result_publisher (MessagePublisher[EvaluationResultBatch]): Evaluation result publisher. + gradient_acc_steps (int): Gradient accumulation steps. + global_num_tokens_per_train_step (int): Global number of tokens per train step. + num_seen_train_steps (int): Number of seen train steps. + global_num_seen_tokens (int): Global number of seen tokens. + num_target_steps (int): Number of target steps. + num_target_tokens (int): Number of target tokens. + gradient_clipper (GradientClipperIF): Gradient clipper. + mfu_calculator (Optional[MFUCalculatorABC]): MFU calculator. + """ + def __init__( self, global_rank: int, From 695223059f15bbcaefc682e10f40322046279422 Mon Sep 17 00:00:00 2001 From: rrutmann Date: Mon, 22 Sep 2025 15:09:25 +0200 Subject: [PATCH 44/67] docs: Add type hints and docstrings --- src/modalities/trainer.py | 42 +++++++++++---------------------------- 1 file changed, 12 insertions(+), 30 deletions(-) diff --git a/src/modalities/trainer.py b/src/modalities/trainer.py index 4b79dbedc..bff3c7b47 100644 --- a/src/modalities/trainer.py +++ b/src/modalities/trainer.py @@ -28,24 +28,6 @@ class ThroughputAggregationKeys(Enum): class Trainer: - """ - Trainer class for model training. - - Args: - global_rank (int): The global rank. - num_pipeline_parallel_ranks (int): Number of pipeline parallel ranks. - progress_publisher (MessagePublisher[ProgressUpdate]): Progress publisher. - evaluation_result_publisher (MessagePublisher[EvaluationResultBatch]): Evaluation result publisher. 
- gradient_acc_steps (int): Gradient accumulation steps. - global_num_tokens_per_train_step (int): Global number of tokens per train step. - num_seen_train_steps (int): Number of seen train steps. - global_num_seen_tokens (int): Global number of seen tokens. - num_target_steps (int): Number of target steps. - num_target_tokens (int): Number of target tokens. - gradient_clipper (GradientClipperIF): Gradient clipper. - mfu_calculator (Optional[MFUCalculatorABC]): MFU calculator. - """ - def __init__( self, global_rank: int, @@ -65,18 +47,18 @@ def __init__( Initializes the Trainer object. Args: - global_rank (int): The global rank to which operates the trainer object. - progress_publisher (MessagePublisher[ProgressUpdate]): The publisher for progress updates. - evaluation_result_publisher (MessagePublisher[EvaluationResultBatch]): - The publisher for evaluation result batches. - gradient_acc_steps (int): The number of gradient accumulation steps. - global_num_tokens_per_train_step (int): The number of global tokens per training step. - num_seen_train_steps (int): The number of training steps already seen. - global_num_seen_tokens (int): The number of tokens already seen. - num_target_steps (int): The target number of training steps. - num_target_tokens (int): The target number of tokens. - gradient_clipper (GradientClipperIF): The gradient clipper. - mfu_calculator (Optional[MFUCalculatorABC]): The MFU calculator. + global_rank (int): The global rank. + num_pipeline_parallel_ranks (int): Number of pipeline parallel ranks. + progress_publisher (MessagePublisher[ProgressUpdate]): Progress publisher. + evaluation_result_publisher (MessagePublisher[EvaluationResultBatch]): Evaluation result publisher. + gradient_acc_steps (int): Gradient accumulation steps. + global_num_tokens_per_train_step (int): Global number of tokens per train step. + num_seen_train_steps (int): Number of seen train steps. + global_num_seen_tokens (int): Global number of seen tokens. 
+ num_target_steps (int): Number of target steps. + num_target_tokens (int): Number of target tokens. + gradient_clipper (GradientClipperIF): Gradient clipper. + mfu_calculator (Optional[MFUCalculatorABC]): MFU calculator. Returns: None From ffa032cd82de6bdb1df4916c1d43baf9bca8b47d Mon Sep 17 00:00:00 2001 From: rrutmann Date: Mon, 22 Sep 2025 17:55:24 +0200 Subject: [PATCH 45/67] fix: Check if parallelism method is initialized --- src/modalities/running_env/fsdp/device_mesh.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/modalities/running_env/fsdp/device_mesh.py b/src/modalities/running_env/fsdp/device_mesh.py index 1e70c5323..5afa7d872 100644 --- a/src/modalities/running_env/fsdp/device_mesh.py +++ b/src/modalities/running_env/fsdp/device_mesh.py @@ -140,4 +140,7 @@ def get_num_parallel_ranks(device_mesh: DeviceMesh, parallelism_method: Parallel Returns: int: The number of parallel ranks for the specified parallelism method. """ - return device_mesh.size(device_mesh.mesh_dim_names.index(parallelism_method.value)) + if parallelism_method.value not in device_mesh.mesh_dim_names: + return 1 + else: + return device_mesh.size(device_mesh.mesh_dim_names.index(parallelism_method.value)) From 8d418a1b8ce90a4747a462a89c2bfd3b36e1a3c2 Mon Sep 17 00:00:00 2001 From: rrutmann Date: Mon, 22 Sep 2025 17:55:44 +0200 Subject: [PATCH 46/67] docs: Add new parameter in docstring --- .../training/gradient_clipping/fsdp_gradient_clipper.py | 2 ++ .../gradient_clipping/fsdp_gradient_clipper_config.py | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/src/modalities/training/gradient_clipping/fsdp_gradient_clipper.py b/src/modalities/training/gradient_clipping/fsdp_gradient_clipper.py index d4b280a32..faeef0035 100644 --- a/src/modalities/training/gradient_clipping/fsdp_gradient_clipper.py +++ b/src/modalities/training/gradient_clipping/fsdp_gradient_clipper.py @@ -109,6 +109,7 @@ def __init__( wrapped_model (FSDP2): The wrapped model. 
max_norm (float): The maximum norm value for gradient clipping. norm_type (GradientClippingMode, optional): The type of gradient clipping. Defaults to GradientClippingMode. + device_mesh (DeviceMesh, optional): The device mesh used for distributed training. Defaults to None. Returns: None @@ -215,6 +216,7 @@ def __init__( Args: wrapped_model (FSDP2): The wrapped FSDP2 model. norm_type (GradientClippingMode, optional): The type of gradient clipping. Defaults to GradientClippingMode. + device_mesh (DeviceMesh, optional): The device mesh used for distributed training. Defaults to None. Returns: None diff --git a/src/modalities/training/gradient_clipping/fsdp_gradient_clipper_config.py b/src/modalities/training/gradient_clipping/fsdp_gradient_clipper_config.py index 80ebee2a8..310fb9b60 100644 --- a/src/modalities/training/gradient_clipping/fsdp_gradient_clipper_config.py +++ b/src/modalities/training/gradient_clipping/fsdp_gradient_clipper_config.py @@ -34,11 +34,13 @@ class FSDP2GradientClipperConfig(FSDP1GradientClipperConfig): max_norm (float): The maximum norm value for gradient clipping. norm_type (GradientClippingMode): The type of gradient clipping to be applied. wrapped_model (PydanticPytorchModuleType): The wrapped PyTorch model. + device_mesh (PydanticDeviceMeshIFType | None): The device mesh configuration. Attributes: max_norm (float): The maximum norm value for gradient clipping. norm_type (GradientClippingMode): The type of gradient clipping to be applied. wrapped_model (PydanticPytorchModuleType): The wrapped PyTorch model. + device_mesh (PydanticDeviceMeshIFType | None): The device mesh configuration. """ device_mesh: PydanticDeviceMeshIFType | None = None @@ -68,10 +70,12 @@ class FSDP2DummyGradientClipperConfig(FSDP1DummyGradientClipperConfig): Args: wrapped_model (PydanticPytorchModuleType): The wrapped PyTorch model. norm_type (GradientClippingMode): The type of gradient clipping to be applied. 
+ device_mesh (PydanticDeviceMeshIFType | None): The device mesh configuration. Attributes: wrapped_model (PydanticPytorchModuleType): The wrapped PyTorch model. norm_type (GradientClippingMode): The type of gradient clipping to be applied. + device_mesh (PydanticDeviceMeshIFType | None): The device mesh configuration. """ wrapped_model: PydanticPytorchModuleType From fffd0a1fc210fa3e2a1f31b7773c14c73d710303 Mon Sep 17 00:00:00 2001 From: rrutmann Date: Tue, 23 Sep 2025 11:40:38 +0200 Subject: [PATCH 47/67] test: Run only one PP only test --- .../pipeline_parallelism/test_pp_fwd_bwd_pass.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py b/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py index dabd731b9..d255d62e0 100644 --- a/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py +++ b/tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_bwd_pass.py @@ -72,7 +72,6 @@ def _get_components( "sharding_degree, tp_degree, pp_degree, world_size", [ (2, 1, 2, 4), - (2, 1, 4, 8), (2, 2, 2, 8), ], ) From 049472f9bf68733b1b922ac6f3a36cf0ba68efe7 Mon Sep 17 00:00:00 2001 From: rrutmann Date: Wed, 24 Sep 2025 13:07:20 +0200 Subject: [PATCH 48/67] refactor: Addressed copilot review --- src/modalities/running_env/fsdp/device_mesh.py | 7 +++++++ src/modalities/trainer.py | 1 - .../gradient_clipping/fsdp_gradient_clipper.py | 14 ++++---------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/modalities/running_env/fsdp/device_mesh.py b/src/modalities/running_env/fsdp/device_mesh.py index 5afa7d872..add2cc1da 100644 --- a/src/modalities/running_env/fsdp/device_mesh.py +++ b/src/modalities/running_env/fsdp/device_mesh.py @@ -144,3 +144,10 @@ def get_num_parallel_ranks(device_mesh: DeviceMesh, parallelism_method: Parallel return 1 else: return device_mesh.size(device_mesh.mesh_dim_names.index(parallelism_method.value)) + + +def 
get_mesh_for_parallelism_method(device_mesh: DeviceMesh | None, parallelism_method: ParallelismDegrees): + if device_mesh is not None and parallelism_method.value in device_mesh.mesh_dim_names: + return device_mesh[parallelism_method.value] + else: + return None diff --git a/src/modalities/trainer.py b/src/modalities/trainer.py index bff3c7b47..f81407f02 100644 --- a/src/modalities/trainer.py +++ b/src/modalities/trainer.py @@ -125,7 +125,6 @@ def _train_batch( """ if scheduled_pipeline is not None: pp_schedule = scheduled_pipeline.pp_schedule - # TODO: handle loss and backward in pp # Pipeline Parallel forward / backward inside step() call # with self.train_context(optional_context_parallel_ctx): targets, losses = ( diff --git a/src/modalities/training/gradient_clipping/fsdp_gradient_clipper.py b/src/modalities/training/gradient_clipping/fsdp_gradient_clipper.py index faeef0035..c4009cf41 100644 --- a/src/modalities/training/gradient_clipping/fsdp_gradient_clipper.py +++ b/src/modalities/training/gradient_clipping/fsdp_gradient_clipper.py @@ -9,7 +9,7 @@ from torch.distributed.tensor import DTensor from modalities.config.lookup_enum import LookupEnum -from modalities.running_env.fsdp.device_mesh import ParallelismDegrees +from modalities.running_env.fsdp.device_mesh import ParallelismDegrees, get_mesh_for_parallelism_method from modalities.training.gradient_clipping.gradient_clipper import GradientClipperIF @@ -186,11 +186,7 @@ def clip_grad_norm_( total_norm = total_norm.full_tensor() - pp_mesh = ( - device_mesh[ParallelismDegrees.PP.value] - if device_mesh is not None and ParallelismDegrees.PP.value in device_mesh.mesh_dim_names - else None - ) + pp_mesh = get_mesh_for_parallelism_method(device_mesh=device_mesh, parallelism_method=ParallelismDegrees.PP) if pp_mesh is not None: if math.isinf(norm_type): dist.all_reduce(total_norm, op=dist.ReduceOp.MAX, group=pp_mesh.get_group()) @@ -242,10 +238,8 @@ def clip_gradients(self) -> torch.Tensor: # If only using PP, 
total_norm will be a local tensor. total_norm = total_norm.full_tensor() - pp_mesh = ( - self.device_mesh[ParallelismDegrees.PP.value] - if self.device_mesh is not None and ParallelismDegrees.PP.value in self.device_mesh.mesh_dim_names - else None + pp_mesh = get_mesh_for_parallelism_method( + device_mesh=self.device_mesh, parallelism_method=ParallelismDegrees.PP ) if pp_mesh is not None: if math.isinf(self.norm_type.value): From 608c7fc9b6d7d4300a62d4548cbbaf1b3059c4ab Mon Sep 17 00:00:00 2001 From: rrutmann Date: Wed, 15 Oct 2025 17:11:41 +0200 Subject: [PATCH 49/67] chore: Remove requirements for python and torch --- pyproject.toml | 2 -- 1 file changed, 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5a3c84bf1..1396b0cf5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,12 +1,10 @@ [project] name = "modalities" version = "0.3.2" -requires-python = ">=3.10,<3.12" description = "Modalities, a PyTorch-native framework for distributed and reproducible foundation model training." 
readme = "README.md" dependencies = [ "numpy<2.0", - "torch==2.6.0", "packaging", "tqdm", "pyyaml", From 16c4bc479309baaa7da1eece431df6c5d0ccaf6a Mon Sep 17 00:00:00 2001 From: rrutmann Date: Fri, 17 Oct 2025 14:15:58 +0200 Subject: [PATCH 50/67] fix: Allow dp shard degree 1 --- src/modalities/running_env/fsdp/device_mesh.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/modalities/running_env/fsdp/device_mesh.py b/src/modalities/running_env/fsdp/device_mesh.py index add2cc1da..e9f1f3e95 100644 --- a/src/modalities/running_env/fsdp/device_mesh.py +++ b/src/modalities/running_env/fsdp/device_mesh.py @@ -119,7 +119,7 @@ def get_device_mesh( ], strict=True, ): - if dim > 1: + if dim > 1 or name == ParallelismDegrees.DP_SHARD.value: dims.append(dim) names.append(name) names = tuple(names) From f5a10205b6a1568a9748f8931211f1cc425e2398 Mon Sep 17 00:00:00 2001 From: rrutmann Date: Fri, 17 Oct 2025 14:17:34 +0200 Subject: [PATCH 51/67] test: Add test for checkpointing with pipeline parallelism --- .../checkpointing/checkpointing_test_utils.py | 36 +++- tests/checkpointing/fsdp2_pp_gpt2_config.yaml | 194 ++++++++++++++++++ ...fsdp2_dcp_checkpoint_loading_and_saving.py | 164 ++++++++++----- 3 files changed, 343 insertions(+), 51 deletions(-) create mode 100644 tests/checkpointing/fsdp2_pp_gpt2_config.yaml diff --git a/tests/checkpointing/checkpointing_test_utils.py b/tests/checkpointing/checkpointing_test_utils.py index 21c4caabe..7a3f241bf 100644 --- a/tests/checkpointing/checkpointing_test_utils.py +++ b/tests/checkpointing/checkpointing_test_utils.py @@ -15,10 +15,17 @@ class CheckpointingTestUtils: @staticmethod def generate_batch(gpt2_model_config: dict): # prepare input and targets + if "settings" in gpt2_model_config: + batch_size = gpt2_model_config["settings"]["step_profile"]["local_train_micro_batch_size"] + else: + batch_size = 8 data = torch.randint( 0, # lowest token_id gpt2_model_config["model_raw"]["config"]["vocab_size"], # highest 
token_id + 1, i.e. vocab_size - (8, gpt2_model_config["model_raw"]["config"]["sequence_length"] + 1), # (batch_size, sequence_length + 1) + ( + batch_size, + gpt2_model_config["model_raw"]["config"]["sequence_length"] + 1, + ), # (batch_size, sequence_length + 1) ).cuda() batch_input_ids_dict = {gpt2_model_config["model_raw"]["config"]["sample_key"]: data[:, :-1]} batch_target_ids = data[:, 1:] @@ -49,6 +56,33 @@ def forward_backward_pass( optimizer.step() return loss + @staticmethod + def forward_backward_pp_pass( + scheduled_pipeline, + optimizer: Optimizer, + batch_input_ids_dict: dict, + batch_target_ids: torch.Tensor, + ): + pp_schedule = scheduled_pipeline.pp_schedule + # Pipeline Parallel forward / backward inside step() call + # with self.train_context(optional_context_parallel_ctx): + targets, losses = (batch_target_ids.contiguous(), []) if scheduled_pipeline.is_last_pp_stage else (None, None) + + if scheduled_pipeline.is_first_pp_stage: + pp_schedule.step( + batch_input_ids_dict[scheduled_pipeline.model_part.sample_key].contiguous(), + target=targets, + losses=losses, + ) + else: + pp_schedule.step(target=targets, losses=losses) + loss = torch.mean(torch.stack(losses)).to(losses[0].device) if scheduled_pipeline.is_last_pp_stage else None + optimizer.step() + # clear the gradients + optimizer.zero_grad() + + return loss + @staticmethod def get_gpt2_model_from_config(gpt2_model_config_dict: dict) -> GPT2LLM: class GPT2InstantationModel(BaseModel): diff --git a/tests/checkpointing/fsdp2_pp_gpt2_config.yaml b/tests/checkpointing/fsdp2_pp_gpt2_config.yaml new file mode 100644 index 000000000..4a02aa6b2 --- /dev/null +++ b/tests/checkpointing/fsdp2_pp_gpt2_config.yaml @@ -0,0 +1,194 @@ +settings: + referencing_keys: + sample_key: input_ids + target_key: target_ids + prediction_key: logits + cuda_env: + local_rank: ${cuda_env:LOCAL_RANK} + step_profile: + local_train_micro_batch_size: 8 + +app_state: + component_key: app_state + variant_key: raw + config: + 
model: + instance_key: fsdp_model + pass_type: BY_REFERENCE + optimizer: + instance_key: optimizer + pass_type: BY_REFERENCE + +scheduled_pipeline: + component_key: pipeline + variant_key: scheduled + config: + loss_fn: + instance_key: loss_fn + pass_type: BY_REFERENCE + pp_schedule_name: gpipe + batch_size: ${settings.step_profile.local_train_micro_batch_size} + microbatch_size: 4 + pp_degree: ${device_mesh.config.pipeline_parallel_degree} + # maybe better to use the fsdp model and the schedule here + # instead of passing in the staged pipeline? + # If fsdp_model creates a copy then this is not in the scope of + # the staged pipeline. + pipeline: + component_key: pipeline + variant_key: builder + config: + pp_stage: + component_key: pipeline + variant_key: selector + config: + pipeline: + instance_key: staged_pipeline + pass_type: BY_REFERENCE + selection_type: PP_STAGE + model_part: + instance_key: fsdp_model + pass_type: BY_REFERENCE + +fsdp_model: + component_key: model + variant_key: fsdp2_wrapped + config: + model: + instance_key: model_part + pass_type: BY_REFERENCE + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + mixed_precision_settings: + param_dtype: BF_16 + reduce_dtype: BF_16 + block_names: [GPT2Block] + +model_part: + component_key: pipeline + variant_key: selector + config: + pipeline: + instance_key: staged_pipeline + pass_type: BY_REFERENCE + selection_type: MODEL_PART + +staged_pipeline: + component_key: pipeline + variant_key: staged + config: + whole_model: + instance_key: initialized_model + pass_type: BY_REFERENCE + stages_generator: + component_key: stages_generator + variant_key: gpt2_stages_generator + config: + num_model_layers: ${model_raw.config.n_layer} + input_layer_equivalence: 1 + output_layer_equivalence: 1 + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + local_rank: ${cuda_env:LOCAL_RANK} + pp_schedule_name: gpipe + num_layers_per_stage: 2 + +initialized_model: + component_key: model + 
variant_key: model_initialized + config: + model: + instance_key: model_raw + pass_type: BY_REFERENCE + model_initializer: + component_key: model_initialization + variant_key: composed + config: + model_type: gpt2 + weight_init_type: scaled + mean: 0.0 + std: 0.02 + num_layers: ${model_raw.config.n_layer} + +model_raw: + component_key: model + variant_key: gpt2 + config: + sample_key: "input_ids" # TODO reference this + poe_type: NOPE + prediction_key: "logits" # TODO reference this + sequence_length: 256 # TODO reference this (same as sequence length) + vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency + n_layer: 2 + n_head_q: 4 + n_head_kv: 4 + ffn_hidden: 128 + n_embd: 128 + dropout: 0.0 + bias: true # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster + attention_config: + qkv_transforms: + - type_hint: RotaryTransform + config: + n_embd: ${model_raw.config.n_embd} + n_head: ${model_raw.config.n_head_q} #it has to be head_q here + seq_length_dim: -2 + base_freq: 10000 + attention_implementation: manual + activation_type: gelu + attention_norm_config: + norm_type: rms_norm + config: + ndim: ${model_raw.config.n_embd} + bias: true + epsilon: 1e-5 + ffn_norm_config: + norm_type: rms_norm + config: + ndim: ${model_raw.config.n_embd} + bias: true + epsilon: 1e-5 + lm_head_norm_config: + norm_type: rms_norm + config: + ndim: ${model_raw.config.n_embd} + bias: true + epsilon: 1e-5 + use_weight_tying: false + use_meta_device: true + +optimizer: + component_key: optimizer + variant_key: adam_w + config: + lr: 0.0003 + betas: + - 0.9 + - 0.95 + eps: 1.0e-08 + weight_decay: 0.1 + weight_decay_groups_excluded: + - embedding + - layernorm + wrapped_model: + instance_key: fsdp_model + pass_type: BY_REFERENCE + +loss_fn: + component_key: loss + variant_key: clm_cross_entropy_loss + config: + target_key: ${settings.referencing_keys.target_key} + prediction_key: 
${settings.referencing_keys.prediction_key} + +device_mesh: + component_key: device_mesh + variant_key: default + config: + device_type: cuda + data_parallel_replicate_degree: 1 + pipeline_parallel_degree: 2 + data_parallel_shard_degree: -1 + world_size: ${cuda_env:WORLD_SIZE} diff --git a/tests/checkpointing/test_fsdp2_dcp_checkpoint_loading_and_saving.py b/tests/checkpointing/test_fsdp2_dcp_checkpoint_loading_and_saving.py index e1f0f349c..dc82f19d5 100644 --- a/tests/checkpointing/test_fsdp2_dcp_checkpoint_loading_and_saving.py +++ b/tests/checkpointing/test_fsdp2_dcp_checkpoint_loading_and_saving.py @@ -1,6 +1,7 @@ import json import os import tempfile +import traceback from copy import deepcopy from pathlib import Path @@ -16,7 +17,8 @@ from modalities.checkpointing.fsdp.fsdp_checkpoint_saving import DCPCheckpointSaving from modalities.checkpointing.stateful.app_state import AppState from modalities.config.config import ProcessGroupBackendType, load_app_config_dict -from modalities.config.pydantic_if_types import PydanticAppStateType +from modalities.config.pydantic_if_types import PydanticAppStateType, PydanticPipelineType +from modalities.models.parallelism.pipeline_parallelism import Pipeline from modalities.training.training_progress import TrainingProgress from tests.checkpointing.checkpointing_test_utils import CheckpointingTestUtils from tests.end2end_tests.custom_components import MultiProcessingCudaEnv @@ -41,8 +43,8 @@ def get_gpt2_model_config_dict(gpt2_model_config_path: Path) -> dict: @pytest.mark.skipif( - torch.cuda.device_count() < 2, - reason="This e2e test requires 2 GPUs", + torch.cuda.device_count() < 4, + reason="This e2e test requires 4 GPUs", ) class TestFSDP2DCPCheckpointing: @staticmethod @@ -57,11 +59,32 @@ class ComponentsInstantiationModel(BaseModel): return components.app_state @staticmethod - def test_save_checkpoint_after_backward_pass(temporary_checkpoint_folder_path: Path, gpt2_model_config_path: Path): - world_size = 2 + def 
_get_scheduled_pipeline(config_file_path: Path) -> Pipeline: + class ComponentsInstantiationModel(BaseModel): + scheduled_pipeline: PydanticPipelineType + + main_obj = Main(config_file_path) + components: ComponentsInstantiationModel = main_obj.build_components( + components_model_type=ComponentsInstantiationModel + ) + return components.scheduled_pipeline + + @staticmethod + @pytest.mark.parametrize( + "config_filename,world_size,use_pp", + [ + ("fsdp2_gpt2_config.yaml", 2, False), + ("fsdp2_pp_gpt2_config.yaml", 2, True), + ], + ) + def test_save_checkpoint_after_backward_pass( + temporary_checkpoint_folder_path: Path, config_filename: str, world_size: int, use_pp: bool + ): + working_dir = Path(os.path.dirname(__file__)) + config_file_path = working_dir / config_filename mp.spawn( TestFSDP2DCPCheckpointing._test_save_checkpoint_after_backward_pass_impl_wrapper, - args=(world_size, temporary_checkpoint_folder_path, gpt2_model_config_path), + args=(world_size, temporary_checkpoint_folder_path, config_file_path, use_pp), nprocs=world_size, join=True, ) @@ -72,6 +95,7 @@ def _test_save_checkpoint_after_backward_pass_impl_wrapper( world_size: int, temporary_checkpoint_folder_path: Path, gpt2_model_config_path: Path, + use_pp: bool, ): # wraps the actual test function to be able to run it in a distributed multiprocessing setup with MultiProcessingCudaEnv( @@ -79,31 +103,44 @@ def _test_save_checkpoint_after_backward_pass_impl_wrapper( global_rank=process_id, local_rank=process_id, world_size=world_size, - rdvz_port=22356, + rdvz_port=22358, ): - # build all the components for the test - app_state1 = TestFSDP2DCPCheckpointing._get_app_state(config_file_path=gpt2_model_config_path) - app_state2 = TestFSDP2DCPCheckpointing._get_app_state(config_file_path=gpt2_model_config_path) - - gpt2_model_config_dict = get_gpt2_model_config_dict(gpt2_model_config_path=gpt2_model_config_path) - experiment_id = "0" - checkpoint_loading = DCPCheckpointLoading(global_rank=process_id) - 
checkpoint_saving = DCPCheckpointSaving( - checkpoint_path=temporary_checkpoint_folder_path, - experiment_id=experiment_id, - global_rank=process_id, - ) + try: + # build all the components for the test + app_state1 = TestFSDP2DCPCheckpointing._get_app_state(config_file_path=gpt2_model_config_path) + app_state2 = TestFSDP2DCPCheckpointing._get_app_state(config_file_path=gpt2_model_config_path) - # run the test - TestFSDP2DCPCheckpointing._test_save_checkpoint_after_backward_pass_impl( - app_state1=app_state1, - app_state2=app_state2, - gpt2_model_config_dict=gpt2_model_config_dict, - checkpoint_loading=checkpoint_loading, - checkpoint_saving=checkpoint_saving, - temporary_checkpoint_folder_path=temporary_checkpoint_folder_path, - experiment_id=experiment_id, - ) + if use_pp: + app_state1.scheduled_pipeline = TestFSDP2DCPCheckpointing._get_scheduled_pipeline( + config_file_path=gpt2_model_config_path + ) + app_state2.scheduled_pipeline = TestFSDP2DCPCheckpointing._get_scheduled_pipeline( + config_file_path=gpt2_model_config_path + ) + + gpt2_model_config_dict = get_gpt2_model_config_dict(gpt2_model_config_path=gpt2_model_config_path) + experiment_id = "0" + checkpoint_loading = DCPCheckpointLoading(global_rank=process_id) + checkpoint_saving = DCPCheckpointSaving( + checkpoint_path=temporary_checkpoint_folder_path, + experiment_id=experiment_id, + global_rank=process_id, + ) + + # run the test + TestFSDP2DCPCheckpointing._test_save_checkpoint_after_backward_pass_impl( + app_state1=app_state1, + app_state2=app_state2, + gpt2_model_config_dict=gpt2_model_config_dict, + checkpoint_loading=checkpoint_loading, + checkpoint_saving=checkpoint_saving, + temporary_checkpoint_folder_path=temporary_checkpoint_folder_path, + experiment_id=experiment_id, + ) + except Exception as e: + print(f"Exception in _forward_step_with_pp: {e}") + traceback.print_exc() # <-- Add this line to print the full stack trace + raise e @staticmethod def 
_test_save_checkpoint_after_backward_pass_impl( @@ -139,13 +176,21 @@ def _test_save_checkpoint_after_backward_pass_impl( # run backward pass batch_input_ids_dict, batch_target_ids = CheckpointingTestUtils.generate_batch(gpt2_model_config_dict) - loss_0 = CheckpointingTestUtils.forward_backward_pass( - prediction_key=prediction_key, - model=app_state1.model, - optimizer=app_state1.optimizer, - batch_input_ids_dict=batch_input_ids_dict, - batch_target_ids=batch_target_ids, - ) + if hasattr(app_state1, "scheduled_pipeline"): + loss_0 = CheckpointingTestUtils.forward_backward_pp_pass( + scheduled_pipeline=app_state1.scheduled_pipeline, + optimizer=app_state1.optimizer, + batch_input_ids_dict=batch_input_ids_dict, + batch_target_ids=batch_target_ids, + ) + else: + loss_0 = CheckpointingTestUtils.forward_backward_pass( + prediction_key=prediction_key, + model=app_state1.model, + optimizer=app_state1.optimizer, + batch_input_ids_dict=batch_input_ids_dict, + batch_target_ids=batch_target_ids, + ) # save the updated model and optimizer states for later comparisons updated_model_parameters = CheckpointingTestUtils.clone_parameters(app_state1.model) @@ -198,21 +243,40 @@ def _test_save_checkpoint_after_backward_pass_impl( loaded_and_updated_optimizer_state_dict = deepcopy(app_state1.optimizer.state_dict()) # perform another forward pass and backward pass for the previous and the loaded model - loss_1 = CheckpointingTestUtils.forward_backward_pass( - prediction_key=prediction_key, - model=app_state1.model, - optimizer=app_state1.optimizer, - batch_input_ids_dict=batch_input_ids_dict, - batch_target_ids=batch_target_ids, - ) + if hasattr(app_state1, "scheduled_pipeline"): + try: + # loss_1 = CheckpointingTestUtils.forward_backward_pp_pass( + # scheduled_pipeline=app_state1.scheduled_pipeline, + # optimizer=app_state1.optimizer, + # batch_input_ids_dict=batch_input_ids_dict, + # batch_target_ids=batch_target_ids, + # ) + loss_2 = CheckpointingTestUtils.forward_backward_pp_pass( 
+ scheduled_pipeline=app_state2.scheduled_pipeline, + optimizer=app_state2.optimizer, + batch_input_ids_dict=batch_input_ids_dict, + batch_target_ids=batch_target_ids, + ) + except Exception as e: + print(f"Exception in _forward_step_with_pp: {e}") + traceback.print_exc() # <-- Add this line to print the full stack trace + raise e + else: + loss_1 = CheckpointingTestUtils.forward_backward_pass( + prediction_key=prediction_key, + model=app_state1.model, + optimizer=app_state1.optimizer, + batch_input_ids_dict=batch_input_ids_dict, + batch_target_ids=batch_target_ids, + ) - loss_2 = CheckpointingTestUtils.forward_backward_pass( - prediction_key=prediction_key, - model=app_state2.model, - optimizer=app_state2.optimizer, - batch_input_ids_dict=batch_input_ids_dict, - batch_target_ids=batch_target_ids, - ) + loss_2 = CheckpointingTestUtils.forward_backward_pass( + prediction_key=prediction_key, + model=app_state2.model, + optimizer=app_state2.optimizer, + batch_input_ids_dict=batch_input_ids_dict, + batch_target_ids=batch_target_ids, + ) assert loss_1 == loss_2, f"loss_1 = {loss_1} does not equal loss_2 = {loss_2}" assert loss_1 < loss_0, f"loss_1 = {loss_1} is not less than loss_0 = {loss_0}" From 9d1f107d587cc6d6709609c41c20afe2bd0a410e Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Fri, 17 Oct 2025 18:44:05 +0200 Subject: [PATCH 52/67] fix(parallelism): Building model stages in PP now also filters the model's weight_decay_groups. --- .../parallelism/pipeline_parallelism.py | 31 +++++++++++++++---- .../optimizers/optimizer_factory.py | 12 +++---- 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/src/modalities/models/parallelism/pipeline_parallelism.py b/src/modalities/models/parallelism/pipeline_parallelism.py index 006d97a55..9d7e97718 100644 --- a/src/modalities/models/parallelism/pipeline_parallelism.py +++ b/src/modalities/models/parallelism/pipeline_parallelism.py @@ -3,8 +3,9 @@ # licensed under the BSD 3-Clause License. 
import copy +import re from enum import Enum -from typing import Any, Optional, Type +from typing import Any, Optional, Type, cast import torch import torch.nn as nn @@ -13,6 +14,7 @@ from torch.distributed.pipelining.schedules import PipelineScheduleSingle, get_schedule_class from modalities.loss_functions import Loss +from modalities.models.model import NNModel from modalities.models.parallelism.stages_generator import StagesGenerator from modalities.running_env.fsdp.device_mesh import ParallelismDegrees from modalities.utils.logger_utils import get_logger @@ -83,13 +85,13 @@ class PipelineFactory: @staticmethod def get_pipeline( - pp_stage: PipelineStage, model_part: nn.Module, pp_schedule: Optional[PipelineScheduleSingle] = None + pp_stage: PipelineStage, model_part: NNModel, pp_schedule: Optional[PipelineScheduleSingle] = None ) -> Pipeline: return Pipeline(pp_stage=pp_stage, model_part=model_part, pp_schedule=pp_schedule) @staticmethod def get_staged_pipeline( - whole_model: nn.Module, + whole_model: NNModel, stages_generator: StagesGenerator, device_mesh: DeviceMesh, local_rank: int, @@ -128,12 +130,12 @@ def get_staged_pipeline( @staticmethod def _get_split_model( - whole_model: nn.Module, + whole_model: NNModel, schedule_class: Type[PipelineScheduleSingle], pp_mesh: DeviceMesh, device: torch.device, fqns_per_stage: list[list[str]], - ) -> tuple[PipelineStage, nn.Module]: + ) -> tuple[PipelineStage, NNModel]: def get_stage_id_of_pp_rank(pp_mesh: DeviceMesh): # NOTE: torch titan a more complicated way to get the stage id of pp rank # since they also allow for multi-stage schedules @@ -164,7 +166,7 @@ def _get_fqn_tree(fqns: list[str]) -> dict[str, Any]: def _build_stage_from_modules( fqn_tree: dict[str, Any], module: nn.Module, module_name: Optional[str] = None - ) -> tuple[PipelineStage, nn.Module]: + ) -> nn.Module: if isinstance(module, nn.ModuleDict): if module_name not in fqn_tree: dict_modules = nn.ModuleDict({}) @@ -239,6 +241,8 @@ def 
_build_stage_from_modules( whole_model = copy.deepcopy(whole_model) fqn_tree = _get_fqn_tree(module_names) stage_modules = _build_stage_from_modules(fqn_tree, whole_model) + stage_modules = cast(NNModel, stage_modules) + PipelineFactory._filter_weight_decay_groups_(stage_modules) stage = PipelineStage( submodule=stage_modules, stage_index=stage_idx, @@ -248,6 +252,21 @@ def _build_stage_from_modules( ) return stage, stage_modules + @staticmethod + def _filter_weight_decay_groups_(stage_modules: NNModel): + params = {name for name, parameter in stage_modules.named_parameters() if parameter.requires_grad} + for group_list in stage_modules.weight_decay_groups.values(): + remove_from_group = [ + group_entry + for group_entry in group_list + if all([not bool(re.search(group_entry, name)) for name in params]) + ] + for remove in remove_from_group: + group_list.remove(remove) + empty_group_keys = [k for k, v in stage_modules.weight_decay_groups.items() if len(v) == 0] + for key in empty_group_keys: + del stage_modules.weight_decay_groups[key] + @staticmethod def get_scheduled_pipeline( loss_fn: Loss, pp_schedule_name: str, batch_size: int, microbatch_size: int, pp_degree: int, pipeline: Pipeline diff --git a/src/modalities/optimizers/optimizer_factory.py b/src/modalities/optimizers/optimizer_factory.py index c430e82a1..5a0ae2bdc 100644 --- a/src/modalities/optimizers/optimizer_factory.py +++ b/src/modalities/optimizers/optimizer_factory.py @@ -12,6 +12,7 @@ from modalities.exceptions import OptimizerError from modalities.models.model import NNModel from modalities.util import get_local_number_of_trainable_parameters, print_rank_0 +from modalities.utils.logger_utils import get_logger from modalities.utils.typing_utils import FSDPX OptimizerGroups = list[dict[str, list[nn.Parameter] | float]] @@ -80,7 +81,7 @@ def get_optimizer_groups(model: FSDP, weight_decay: float, weight_decay_groups_e optimizer_groups_names = ["all"] else: # there will be N optimizer groups, i.e. 
one for each model parameter group - _assert_existence_of_weight_decay_groups_excluded(model, weight_decay_groups_excluded) + _check_existence_of_weight_decay_groups_excluded(model, weight_decay_groups_excluded) optimizer_groups, optimizer_groups_names = _create_optimizer_groups( model, weight_decay, weight_decay_groups_excluded ) @@ -90,9 +91,7 @@ def get_optimizer_groups(model: FSDP, weight_decay: float, weight_decay_groups_e return optimizer_groups -def _assert_existence_of_weight_decay_groups_excluded( - model: nn.Module, weight_decay_groups_excluded: list[str] -) -> None: +def _check_existence_of_weight_decay_groups_excluded(model: nn.Module, weight_decay_groups_excluded: list[str]) -> None: """ checks the existence of all groups that are to be excluded from weight decay @@ -113,9 +112,10 @@ def _assert_existence_of_weight_decay_groups_excluded( weight_decay_groups = nn_model.weight_decay_groups for group in weight_decay_groups_excluded: if group not in weight_decay_groups.keys(): - raise OptimizerError( + get_logger(name="optimizer_factory").warning( f"group = {group} specified in weight_decay_groups_excluded is not " - + f"in models optimizer_module_groups = {list(weight_decay_groups.keys())}" + + f"in models optimizer_module_groups = {list(weight_decay_groups.keys())}. " + + "(This might be due to pipeline parallelism and is not necessarily an error.)" ) From dfc1bdebbcac047daf9f74f7d521d219a6cb48cb Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Fri, 17 Oct 2025 18:46:02 +0200 Subject: [PATCH 53/67] test(checkpointing): Some fixes for pp checkpointing test. 
--- ...fsdp2_dcp_checkpoint_loading_and_saving.py | 56 ++++++++----------- 1 file changed, 24 insertions(+), 32 deletions(-) diff --git a/tests/checkpointing/test_fsdp2_dcp_checkpoint_loading_and_saving.py b/tests/checkpointing/test_fsdp2_dcp_checkpoint_loading_and_saving.py index dc82f19d5..77fcd3edb 100644 --- a/tests/checkpointing/test_fsdp2_dcp_checkpoint_loading_and_saving.py +++ b/tests/checkpointing/test_fsdp2_dcp_checkpoint_loading_and_saving.py @@ -18,7 +18,6 @@ from modalities.checkpointing.stateful.app_state import AppState from modalities.config.config import ProcessGroupBackendType, load_app_config_dict from modalities.config.pydantic_if_types import PydanticAppStateType, PydanticPipelineType -from modalities.models.parallelism.pipeline_parallelism import Pipeline from modalities.training.training_progress import TrainingProgress from tests.checkpointing.checkpointing_test_utils import CheckpointingTestUtils from tests.end2end_tests.custom_components import MultiProcessingCudaEnv @@ -48,26 +47,26 @@ def get_gpt2_model_config_dict(gpt2_model_config_path: Path) -> dict: ) class TestFSDP2DCPCheckpointing: @staticmethod - def _get_app_state(config_file_path: Path) -> AppState: - class ComponentsInstantiationModel(BaseModel): - app_state: PydanticAppStateType + def _get_app_state(config_file_path: Path, use_pp: bool = False) -> AppState: + if use_pp: - main_obj = Main(config_file_path) - components: ComponentsInstantiationModel = main_obj.build_components( - components_model_type=ComponentsInstantiationModel - ) - return components.app_state + class ComponentsInstantiationModel(BaseModel): + app_state: PydanticAppStateType + scheduled_pipeline: PydanticPipelineType - @staticmethod - def _get_scheduled_pipeline(config_file_path: Path) -> Pipeline: - class ComponentsInstantiationModel(BaseModel): - scheduled_pipeline: PydanticPipelineType + else: + + class ComponentsInstantiationModel(BaseModel): + app_state: PydanticAppStateType main_obj = 
Main(config_file_path) components: ComponentsInstantiationModel = main_obj.build_components( components_model_type=ComponentsInstantiationModel ) - return components.scheduled_pipeline + app_state = components.app_state + if use_pp: + app_state.scheduled_pipeline = components.scheduled_pipeline + return app_state @staticmethod @pytest.mark.parametrize( @@ -103,20 +102,12 @@ def _test_save_checkpoint_after_backward_pass_impl_wrapper( global_rank=process_id, local_rank=process_id, world_size=world_size, - rdvz_port=22358, + rdvz_port=22355, ): try: # build all the components for the test - app_state1 = TestFSDP2DCPCheckpointing._get_app_state(config_file_path=gpt2_model_config_path) - app_state2 = TestFSDP2DCPCheckpointing._get_app_state(config_file_path=gpt2_model_config_path) - - if use_pp: - app_state1.scheduled_pipeline = TestFSDP2DCPCheckpointing._get_scheduled_pipeline( - config_file_path=gpt2_model_config_path - ) - app_state2.scheduled_pipeline = TestFSDP2DCPCheckpointing._get_scheduled_pipeline( - config_file_path=gpt2_model_config_path - ) + app_state1 = TestFSDP2DCPCheckpointing._get_app_state(gpt2_model_config_path, use_pp) + app_state2 = TestFSDP2DCPCheckpointing._get_app_state(gpt2_model_config_path, use_pp) gpt2_model_config_dict = get_gpt2_model_config_dict(gpt2_model_config_path=gpt2_model_config_path) experiment_id = "0" @@ -245,12 +236,12 @@ def _test_save_checkpoint_after_backward_pass_impl( # perform another forward pass and backward pass for the previous and the loaded model if hasattr(app_state1, "scheduled_pipeline"): try: - # loss_1 = CheckpointingTestUtils.forward_backward_pp_pass( - # scheduled_pipeline=app_state1.scheduled_pipeline, - # optimizer=app_state1.optimizer, - # batch_input_ids_dict=batch_input_ids_dict, - # batch_target_ids=batch_target_ids, - # ) + loss_1 = CheckpointingTestUtils.forward_backward_pp_pass( + scheduled_pipeline=app_state1.scheduled_pipeline, + optimizer=app_state1.optimizer, + 
batch_input_ids_dict=batch_input_ids_dict, + batch_target_ids=batch_target_ids, + ) loss_2 = CheckpointingTestUtils.forward_backward_pp_pass( scheduled_pipeline=app_state2.scheduled_pipeline, optimizer=app_state2.optimizer, @@ -278,7 +269,8 @@ def _test_save_checkpoint_after_backward_pass_impl( batch_target_ids=batch_target_ids, ) assert loss_1 == loss_2, f"loss_1 = {loss_1} does not equal loss_2 = {loss_2}" - assert loss_1 < loss_0, f"loss_1 = {loss_1} is not less than loss_0 = {loss_0}" + if loss_1 is not None: + assert loss_1 < loss_0, f"loss_1 = {loss_1} is not less than loss_0 = {loss_0}" # check that the model and optimizer states after each backward pass are as expected # model weights From cd9f5951c64c06035149a0e75454a4b0eed611e0 Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Mon, 20 Oct 2025 11:38:47 +0200 Subject: [PATCH 54/67] test(checkpointing): Made dcp checkpointing test terminate correctly when failing in one of multiple subprocesses. --- ...fsdp2_dcp_checkpoint_loading_and_saving.py | 136 ++++++++++++++++-- 1 file changed, 127 insertions(+), 9 deletions(-) diff --git a/tests/checkpointing/test_fsdp2_dcp_checkpoint_loading_and_saving.py b/tests/checkpointing/test_fsdp2_dcp_checkpoint_loading_and_saving.py index 77fcd3edb..482d0cde6 100644 --- a/tests/checkpointing/test_fsdp2_dcp_checkpoint_loading_and_saving.py +++ b/tests/checkpointing/test_fsdp2_dcp_checkpoint_loading_and_saving.py @@ -1,6 +1,9 @@ import json +import logging +import multiprocessing as py_mp import os import tempfile +import time import traceback from copy import deepcopy from pathlib import Path @@ -81,13 +84,21 @@ def test_save_checkpoint_after_backward_pass( ): working_dir = Path(os.path.dirname(__file__)) config_file_path = working_dir / config_filename - mp.spawn( + # Use a Manager queue so child processes can report exceptions to the parent. 
+ manager = py_mp.Manager() + error_queue = manager.Queue() + + # Start child processes without joining so the parent can monitor a shared queue + # and terminate remaining workers immediately if any child fails. + proc_ctx = mp.spawn( TestFSDP2DCPCheckpointing._test_save_checkpoint_after_backward_pass_impl_wrapper, - args=(world_size, temporary_checkpoint_folder_path, config_file_path, use_pp), + args=(world_size, temporary_checkpoint_folder_path, config_file_path, use_pp, error_queue), nprocs=world_size, - join=True, + join=False, ) + TestFSDP2DCPCheckpointing._monitor_child_processes(manager, error_queue, proc_ctx) + @staticmethod def _test_save_checkpoint_after_backward_pass_impl_wrapper( process_id: int, @@ -95,6 +106,7 @@ def _test_save_checkpoint_after_backward_pass_impl_wrapper( temporary_checkpoint_folder_path: Path, gpt2_model_config_path: Path, use_pp: bool, + error_queue: "py_mp.managers.SyncManager.Queue", ): # wraps the actual test function to be able to run it in a distributed multiprocessing setup with MultiProcessingCudaEnv( @@ -102,7 +114,7 @@ def _test_save_checkpoint_after_backward_pass_impl_wrapper( global_rank=process_id, local_rank=process_id, world_size=world_size, - rdvz_port=22355, + rdvz_port=22353, ): try: # build all the components for the test @@ -129,9 +141,14 @@ def _test_save_checkpoint_after_backward_pass_impl_wrapper( experiment_id=experiment_id, ) except Exception as e: - print(f"Exception in _forward_step_with_pp: {e}") - traceback.print_exc() # <-- Add this line to print the full stack trace - raise e + tb = traceback.format_exc() + logging.error(f"Process {process_id} encountered an error:\n{e}") + logging.error(tb) + try: + error_queue.put((process_id, tb)) + except Exception: + logging.error("Failed to put exception info into error queue.") + os._exit(1) @staticmethod def _test_save_checkpoint_after_backward_pass_impl( @@ -250,8 +267,8 @@ def _test_save_checkpoint_after_backward_pass_impl( ) except Exception as e: 
print(f"Exception in _forward_step_with_pp: {e}") - traceback.print_exc() # <-- Add this line to print the full stack trace - raise e + traceback.print_exc() + raise else: loss_1 = CheckpointingTestUtils.forward_backward_pass( prediction_key=prediction_key, @@ -307,3 +324,104 @@ def _test_save_checkpoint_after_backward_pass_impl( CheckpointingTestUtils.assert_equality_optimizer_state( app_state1.optimizer.state_dict(), updated_optimizer_state_dict, must_be_equal=False ) + + @staticmethod + def _monitor_child_processes(manager, error_queue, proc_ctx): + # Normalize the return value from mp.spawn. When join=False it often + # returns a ProcessContext-like object that may expose a `processes` + # attribute. Other implementations may return an iterable of Process + # objects. Build a `processes` list defensively so we can monitor and + # terminate child processes below without assuming a particular type. + processes = [] + if proc_ctx is None: + processes = [] + else: + # common attribute names that might hold the list of processes + candidate_attrs = ["processes", "_processes", "workers", "process_list", "processes_"] + found = False + for attr in candidate_attrs: + if hasattr(proc_ctx, attr): + ps = getattr(proc_ctx, attr) + try: + processes = list(ps) + except Exception: + processes = [ps] + found = True + break + if not found: + # If proc_ctx itself is iterable, exhaust it into a list + try: + processes = list(proc_ctx) + except Exception: + # Fallback: if proc_ctx behaves like a single process-like + # object (has terminate/is_alive/join), wrap it in a list. + if hasattr(proc_ctx, "terminate") or hasattr(proc_ctx, "is_alive") or hasattr(proc_ctx, "join"): + processes = [proc_ctx] + else: + processes = [] + + # Monitor the error queue and child processes. If any child reports an exception, + # terminate the other workers and raise the error in the parent to fail the test fast. 
+ try: + # Loop until all processes finished or an error is reported + while True: + # If an error was reported by any child process, terminate remaining children + if not error_queue.empty(): + proc_id, tb = error_queue.get() + # terminate and join all processes (or the proc_ctx wrapper) + for p in processes: + try: + if hasattr(p, "is_alive"): + alive = p.is_alive() + elif hasattr(p, "exitcode"): + alive = getattr(p, "exitcode") is None + else: + alive = True + if alive and hasattr(p, "terminate"): + p.terminate() + except Exception: + pass + # If we didn't find individual process objects but proc_ctx + # exposes a terminate method, call it as a fallback. + try: + if not processes and hasattr(proc_ctx, "terminate"): + proc_ctx.terminate() + except Exception: + pass + + for p in processes: + try: + if hasattr(p, "join"): + p.join(timeout=5) + except Exception: + pass + try: + if hasattr(proc_ctx, "join"): + proc_ctx.join(timeout=1) + except Exception: + pass + raise AssertionError(f"Child process {proc_id} raised an exception:\n{tb}") + + # If all processes have finished, break + all_finished = all((not p.is_alive()) for p in processes) + if all_finished: + # join them to collect exitcodes + for p in processes: + try: + p.join() + except Exception: + pass + # If we have a ProcessContext, call its join to clean up as well + try: + if hasattr(proc_ctx, "join"): + proc_ctx.join(timeout=1) + except Exception: + pass + break + + time.sleep(0.05) + finally: + try: + manager.shutdown() + except Exception: + pass From edf7a4e5b818cd9916c114e31d9264d69884725a Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Tue, 21 Oct 2025 11:39:02 +0200 Subject: [PATCH 55/67] test(checkpointing): Checkpointing equality tests now explicitly only check the local tensor. 
--- .../checkpointing/checkpointing_test_utils.py | 38 +++++++++++++------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/tests/checkpointing/checkpointing_test_utils.py b/tests/checkpointing/checkpointing_test_utils.py index 7a3f241bf..c350ccbc8 100644 --- a/tests/checkpointing/checkpointing_test_utils.py +++ b/tests/checkpointing/checkpointing_test_utils.py @@ -1,5 +1,6 @@ import torch from pydantic import BaseModel +from torch.distributed.tensor import DTensor from torch.nn import CrossEntropyLoss from torch.optim import Optimizer @@ -128,19 +129,32 @@ def assert_equality_optimizer_state( state_2 = optimizer_2_state[param_group_id] assert set(state_1.keys()) == set(state_2.keys()) for state_key in state_1.keys(): - if must_be_equal: - assert torch.equal( - state_1[state_key], state_2[state_key] - ), "_assert_equality_optimizer_state failed (must_be_equal = True)" - else: - assert not torch.equal( - state_1[state_key], state_2[state_key] - ), "_assert_equality_optimizer_state failed (must_be_equal = False)" + CheckpointingTestUtils.assert_equality_two_tensors( + tensor_1=state_1[state_key], + tensor_2=state_2[state_key], + must_be_equal=must_be_equal, + msg_on_failure="_assert_equality_optimizer_state failed", + ) @staticmethod def assert_equality_two_models(params_1: list[torch.Tensor], params_2: list[torch.Tensor], must_be_equal: bool): for p1, p2 in zip(params_1, params_2): - if must_be_equal: - assert torch.equal(p1, p2), "_assert_equality_two_models failed (must_be_equal = True)" - else: - assert not torch.equal(p1, p2), "_assert_equality_two_models failed (must_be_equal = False)" + CheckpointingTestUtils.assert_equality_two_tensors( + tensor_1=p1, + tensor_2=p2, + must_be_equal=must_be_equal, + msg_on_failure="_assert_equality_two_models failed", + ) + + @staticmethod + def assert_equality_two_tensors( + tensor_1: torch.Tensor, tensor_2: torch.Tensor, must_be_equal: bool, msg_on_failure: str = "" + ): + if isinstance(tensor_1, DTensor): + 
assert isinstance(tensor_2, DTensor), f"{msg_on_failure} (type mismatch with DTensor)" + tensor_1 = tensor_1.to_local() + tensor_2 = tensor_2.to_local() + if must_be_equal: + assert torch.equal(tensor_1, tensor_2), f"{msg_on_failure} (must_be_equal = True)" + else: + assert not torch.equal(tensor_1, tensor_2), f"{msg_on_failure} (must_be_equal = False)" From abcf235d0824ac620e135c27628fdc87f38b417d Mon Sep 17 00:00:00 2001 From: rrutmann Date: Tue, 21 Oct 2025 12:06:51 +0200 Subject: [PATCH 56/67] fix: Use ModuleDict for transformer layers for correct checkpointing with pp --- src/modalities/models/gpt2/gpt2_model.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/modalities/models/gpt2/gpt2_model.py b/src/modalities/models/gpt2/gpt2_model.py index a1ae9fd2e..f50a146e0 100644 --- a/src/modalities/models/gpt2/gpt2_model.py +++ b/src/modalities/models/gpt2/gpt2_model.py @@ -845,9 +845,9 @@ def __init__( wte=nn.Embedding(num_embeddings=vocab_size, embedding_dim=n_embd), wpe=wpe, drop=nn.Dropout(dropout), - h=nn.ModuleList( - [ - GPT2Block( + h=nn.ModuleDict( + { + str(layer_id): GPT2Block( n_embd=n_embd, bias=bias, n_head_q=n_head_q, @@ -863,8 +863,8 @@ def __init__( attention_norm=attention_norm_config.norm_type.value(**dict(attention_norm_config.config)), ffn_norm=ffn_norm_config.norm_type.value(**dict(ffn_norm_config.config)), ) - for _ in range(n_layer) - ] + for layer_id in range(n_layer) + } ), lm_head_norm=lm_head_norm_config.norm_type.value(**dict(lm_head_norm_config.config)), # NOTE: If we make the bias configurable, we must update the number of parameters calculation @@ -952,8 +952,8 @@ def forward_impl(self, inputs: torch.Tensor) -> torch.Tensor: # TODO: use drop out also without absolute position embedding? 
h = self.transformer.drop(h) if hasattr(self.transformer, "drop") else h - for block in self.transformer.h: - h = block(h) + for layer_id in self.transformer.h: + h = self.transformer.h[layer_id](h) h = self.transformer.lm_head_norm(h) if hasattr(self.transformer, "lm_head_norm") else h h = self.transformer.lm_head(h) if hasattr(self.transformer, "lm_head") else h return h From 554cd3943ad6c5832b842e3b738f5958c29f3fd8 Mon Sep 17 00:00:00 2001 From: rrutmann Date: Tue, 21 Oct 2025 16:09:13 +0200 Subject: [PATCH 57/67] chore: Rename layer_id to layer_idx --- src/modalities/models/gpt2/gpt2_model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/modalities/models/gpt2/gpt2_model.py b/src/modalities/models/gpt2/gpt2_model.py index f50a146e0..3e27ec5d5 100644 --- a/src/modalities/models/gpt2/gpt2_model.py +++ b/src/modalities/models/gpt2/gpt2_model.py @@ -847,7 +847,7 @@ def __init__( drop=nn.Dropout(dropout), h=nn.ModuleDict( { - str(layer_id): GPT2Block( + str(layer_idx): GPT2Block( n_embd=n_embd, bias=bias, n_head_q=n_head_q, @@ -863,7 +863,7 @@ def __init__( attention_norm=attention_norm_config.norm_type.value(**dict(attention_norm_config.config)), ffn_norm=ffn_norm_config.norm_type.value(**dict(ffn_norm_config.config)), ) - for layer_id in range(n_layer) + for layer_idx in range(n_layer) } ), lm_head_norm=lm_head_norm_config.norm_type.value(**dict(lm_head_norm_config.config)), @@ -952,8 +952,8 @@ def forward_impl(self, inputs: torch.Tensor) -> torch.Tensor: # TODO: use drop out also without absolute position embedding? 
h = self.transformer.drop(h) if hasattr(self.transformer, "drop") else h - for layer_id in self.transformer.h: - h = self.transformer.h[layer_id](h) + for layer_idx in self.transformer.h: + h = self.transformer.h[layer_idx](h) h = self.transformer.lm_head_norm(h) if hasattr(self.transformer, "lm_head_norm") else h h = self.transformer.lm_head(h) if hasattr(self.transformer, "lm_head") else h return h From 484815e5ae8765f53015ab836b8ce54f1cbbf0c2 Mon Sep 17 00:00:00 2001 From: rrutmann Date: Tue, 21 Oct 2025 16:52:49 +0200 Subject: [PATCH 58/67] test: Adapt tests to new gpt2 model structure --- src/modalities/conversion/gpt2/conversion_model.py | 8 ++++---- .../activation_checkpointing.py | 4 ++-- .../test_fsdp2_dcp_checkpoint_loading_and_saving.py | 6 +++--- tests/conversion/gpt2/helper.py | 10 +++++----- tests/test_torch_compile.py | 11 +++++++---- 5 files changed, 21 insertions(+), 18 deletions(-) diff --git a/src/modalities/conversion/gpt2/conversion_model.py b/src/modalities/conversion/gpt2/conversion_model.py index 7b06e3ec0..89fbf194a 100644 --- a/src/modalities/conversion/gpt2/conversion_model.py +++ b/src/modalities/conversion/gpt2/conversion_model.py @@ -136,10 +136,10 @@ def _copy_weights_model(hf_model: GPT2ForCausalLM, modalities_model: GPT2LLM): modalities_model (GPT2LLM): The modalities model from which the weights will be copied. 
""" hf_model.model.embed_tokens.weight.data.copy_(modalities_model.transformer.wte.weight.data) - for hf_layer, modalities_layer in zip(hf_model.model.layers, modalities_model.transformer.h): - _copy_weights_attention(hf_layer, modalities_layer) - _copy_weights_mlp(hf_layer, modalities_layer) - _copy_weights_layer_norms(hf_layer, modalities_layer) + for hf_layer, modalities_layer_idx in zip(hf_model.model.layers, modalities_model.transformer.h): + _copy_weights_attention(hf_layer, modalities_model.transformer.h[modalities_layer_idx]) + _copy_weights_mlp(hf_layer, modalities_model.transformer.h[modalities_layer_idx]) + _copy_weights_layer_norms(hf_layer, modalities_model.transformer.h[modalities_layer_idx]) _copy_weights_base_modules(hf_model.lm_head, modalities_model.transformer.lm_head) _copy_weights_base_modules(hf_model.model.norm, modalities_model.transformer.lm_head_norm) diff --git a/src/modalities/training/activation_checkpointing/activation_checkpointing.py b/src/modalities/training/activation_checkpointing/activation_checkpointing.py index 3cecf192d..0c194c350 100644 --- a/src/modalities/training/activation_checkpointing/activation_checkpointing.py +++ b/src/modalities/training/activation_checkpointing/activation_checkpointing.py @@ -135,8 +135,8 @@ def apply_activation_checkpointing_( raise ValueError(f"Unknown activation checkpointing variant: {ac_variant}") layers = model.get_submodule(layers_fqn) - if not isinstance(layers, nn.ModuleList): - raise ValueError(f"layers_fqn {layers_fqn} does not reference a ModuleList") + if not isinstance(layers, nn.ModuleDict): + raise ValueError(f"layers_fqn {layers_fqn} does not reference a ModuleDict") print_rank_0(f"Applying activation checkpointing to {len(list(layers.named_children()))} layers...") diff --git a/tests/checkpointing/test_fsdp2_dcp_checkpoint_loading_and_saving.py b/tests/checkpointing/test_fsdp2_dcp_checkpoint_loading_and_saving.py index 482d0cde6..bcbdbd32b 100644 --- 
a/tests/checkpointing/test_fsdp2_dcp_checkpoint_loading_and_saving.py +++ b/tests/checkpointing/test_fsdp2_dcp_checkpoint_loading_and_saving.py @@ -114,7 +114,7 @@ def _test_save_checkpoint_after_backward_pass_impl_wrapper( global_rank=process_id, local_rank=process_id, world_size=world_size, - rdvz_port=22353, + rdvz_port=22354, ): try: # build all the components for the test @@ -248,8 +248,8 @@ def _test_save_checkpoint_after_backward_pass_impl( ) loaded_and_updated_model_parameters = CheckpointingTestUtils.clone_parameters(app_state1.model) - loaded_and_updated_optimizer_state_dict = deepcopy(app_state1.optimizer.state_dict()) - + loaded_and_updated_optimizer_state_dict = deepcopy(app_state1.optimizer.state_dict()) + # perform another forward pass and backward pass for the previous and the loaded model if hasattr(app_state1, "scheduled_pipeline"): try: diff --git a/tests/conversion/gpt2/helper.py b/tests/conversion/gpt2/helper.py index 328633ccb..99adbacbc 100644 --- a/tests/conversion/gpt2/helper.py +++ b/tests/conversion/gpt2/helper.py @@ -6,14 +6,14 @@ def check_same_weight_model(converted_model: GPT2ForCausalLM, modalities_model: GPT2LLM): - converted_model.to(device=modalities_model.transformer.h[0].attn.q_attn.weight.device) + converted_model.to(device=modalities_model.transformer.h["0"].attn.q_attn.weight.device) assert torch.equal(converted_model.model.embed_tokens.weight, modalities_model.transformer.wte.weight) - for i, (llama_layer, modalities_layer) in enumerate( + for i, (llama_layer, modalities_layer_idx) in enumerate( zip(converted_model.model.layers, modalities_model.transformer.h) ): - check_same_weight_attention(llama_layer, modalities_layer) - check_same_weight_mlp(llama_layer, modalities_layer) - check_same_weight_layer_norms(llama_layer, modalities_layer) + check_same_weight_attention(llama_layer, modalities_model.transformer.h[modalities_layer_idx]) + check_same_weight_mlp(llama_layer, modalities_model.transformer.h[modalities_layer_idx]) + 
check_same_weight_layer_norms(llama_layer, modalities_model.transformer.h[modalities_layer_idx]) check_same_weight_base_modules(converted_model.lm_head, modalities_model.transformer.lm_head) check_same_weight_base_modules(converted_model.model.norm, modalities_model.transformer.lm_head_norm) diff --git a/tests/test_torch_compile.py b/tests/test_torch_compile.py index fab2ed217..59ae6ecb9 100644 --- a/tests/test_torch_compile.py +++ b/tests/test_torch_compile.py @@ -1,3 +1,6 @@ + +import copy + import pytest import torch.nn as nn @@ -57,7 +60,7 @@ def gpt2_model(): def test_get_compiled_model_compiles_blocks(gpt2_model): - original_blocks = list(gpt2_model.transformer.h) + original_model = copy.deepcopy(gpt2_model) original_wte = gpt2_model.transformer.wte original_lm_head = gpt2_model.transformer.lm_head @@ -65,9 +68,9 @@ def test_get_compiled_model_compiles_blocks(gpt2_model): result_model = ModelFactory.get_compiled_model(gpt2_model, block_names, fullgraph=True) assert len(result_model.transformer.h) == 4, "Should still have four blocks" - for i, (original_block, new_block) in enumerate(zip(original_blocks, result_model.transformer.h)): - assert new_block is not original_block, f"Block {i} should be a compiled version" - assert isinstance(new_block, nn.Module), f"Block {i} should be an nn.Module" + for i, (original_block_idx, new_block_idx) in enumerate(zip(original_model.transformer.h, result_model.transformer.h)): + assert result_model.transformer.h[new_block_idx] is not original_model.transformer.h[original_block_idx], f"Block {i} should be a compiled version" + assert isinstance(result_model.transformer.h[new_block_idx], nn.Module), f"Block {i} should be an nn.Module" assert result_model.transformer.wte is original_wte, "Embedding layer should remain unchanged" assert result_model.transformer.lm_head is original_lm_head, "LM head should remain unchanged" assert result_model is gpt2_model, "Should return the same model instance" From 
ddb249b194e60ae8c277935853a81a3388902038 Mon Sep 17 00:00:00 2001 From: rrutmann Date: Tue, 21 Oct 2025 17:31:22 +0200 Subject: [PATCH 59/67] test: Adapt code to latest changes to pass tests --- src/modalities/models/model_factory.py | 6 +++--- tests/fsdp2_parallelization/test_tensor_parallelism.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/modalities/models/model_factory.py b/src/modalities/models/model_factory.py index d889c213a..7df9ba258 100644 --- a/src/modalities/models/model_factory.py +++ b/src/modalities/models/model_factory.py @@ -683,13 +683,13 @@ def get_gpt2_tensor_parallelized_model(model: GPT2LLM, device_mesh: DeviceMesh) desired_input_layouts=(Replicate(),), ), } - if isinstance(model.transformer.h[0].mlp, SwiGLU): + if isinstance(list(model.transformer.h.values())[0].mlp, SwiGLU): mlp_plan = { "mlp.W": ColwiseParallel(), "mlp.W_2": RowwiseParallel(output_layouts=Shard(1)), "mlp.V": ColwiseParallel(), } - elif isinstance(model.transformer.h[0].mlp, TransformerMLP): + elif isinstance(list(model.transformer.h.values())[0].mlp, TransformerMLP): mlp_plan = { "mlp.c_fc": ColwiseParallel(), "mlp.c_proj": RowwiseParallel(output_layouts=Shard(1)), @@ -701,7 +701,7 @@ def get_gpt2_tensor_parallelized_model(model: GPT2LLM, device_mesh: DeviceMesh) ) transformer_block_tp_plan.update(mlp_plan) - for transformer_block in model.transformer.h: + for transformer_block in model.transformer.h.values(): # override the number of q and kv heads if transformer_block.attn.n_head_q % tp_mesh.size() != 0: raise ValueError( diff --git a/tests/fsdp2_parallelization/test_tensor_parallelism.py b/tests/fsdp2_parallelization/test_tensor_parallelism.py index 449fdb996..ac6554124 100644 --- a/tests/fsdp2_parallelization/test_tensor_parallelism.py +++ b/tests/fsdp2_parallelization/test_tensor_parallelism.py @@ -117,11 +117,11 @@ def _test_tp_sharding_impl( # Ensure models use the correct MLP if activation_type == "gelu": - assert 
isinstance(fsdp2_model.transformer.h[0].mlp, TransformerMLP)
-        assert isinstance(tp_model.transformer.h[0].mlp, TransformerMLP)
+        assert isinstance(fsdp2_model.transformer.h["0"].mlp, TransformerMLP)
+        assert isinstance(tp_model.transformer.h["0"].mlp, TransformerMLP)
     elif activation_type == "swiglu":
-        assert isinstance(fsdp2_model.transformer.h[0].mlp, SwiGLU)
-        assert isinstance(tp_model.transformer.h[0].mlp, SwiGLU)
+        assert isinstance(fsdp2_model.transformer.h["0"].mlp, SwiGLU)
+        assert isinstance(tp_model.transformer.h["0"].mlp, SwiGLU)
 
     # Ensure models are sharded correctly
     assert "tp" in tp_model.transformer.wte.weight.device_mesh.mesh_dim_names
From 51b7db4205401735d1f859b9e18312f8a919b928 Mon Sep 17 00:00:00 2001
From: Timm Ruland
Date: Tue, 21 Oct 2025 17:34:01 +0200
Subject: [PATCH 60/67] test(data): Added tests for distributed multi dim data sampling.
---
 tests/dataloader/distributed/mocks.py | 42 ++++++++++
 .../test_distributed_multidim_dataloader.py | 84 +++++++++++++++++++
 2 files changed, 126 insertions(+)
 create mode 100644 tests/dataloader/distributed/mocks.py
 create mode 100644 tests/dataloader/distributed/test_distributed_multidim_dataloader.py

diff --git a/tests/dataloader/distributed/mocks.py b/tests/dataloader/distributed/mocks.py
new file mode 100644
index 000000000..cc3f044e2
--- /dev/null
+++ b/tests/dataloader/distributed/mocks.py
@@ -0,0 +1,42 @@
+import os
+
+
+class MultiProcessingCudaEnvMock:
+    """Context manager that temporarily sets the distributed-run environment variables.
+
+    On entry, MASTER_ADDR, MASTER_PORT, RANK, LOCAL_RANK and WORLD_SIZE are overwritten with
+    "localhost" and the values passed to the constructor; on exit, the values that were present
+    before entering are restored (variables that did not exist are removed again).
+    No CUDA device is selected; only os.environ is modified.
+    """
+
+    def __init__(
+        self,
+        global_rank: int,
+        local_rank: int,
+        world_size: int,
+        rdvz_port: int,
+    ) -> None:
+        self.global_rank = global_rank
+        self.local_rank = local_rank
+        self.world_size = world_size
+        self.rdvz_port = rdvz_port
+        # Snapshot of the overwritten variables; None marks a variable that was unset before __enter__.
+        self._original_env: dict[str, str | None] = {}
+
+    def __enter__(self):
+        # Store original values
+        for key in ["MASTER_ADDR", "MASTER_PORT", "RANK", "LOCAL_RANK", "WORLD_SIZE"]:
+            self._original_env[key] = os.environ.get(key)
+
+        # Set new environment variables
+        os.environ["MASTER_ADDR"] = "localhost"
+        os.environ["MASTER_PORT"] = str(self.rdvz_port)
+        os.environ["RANK"] = str(self.global_rank)
+        os.environ["LOCAL_RANK"] = str(self.local_rank)
+        os.environ["WORLD_SIZE"] = str(self.world_size)
+
+        # torch.cuda.set_device(local_rank)
+
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        # Restore original environment variables
+        for key, value in self._original_env.items():
+            if value is None:
+                # The variable did not exist before entering the context, so remove it again.
+                os.environ.pop(key, None)
+            else:
+                os.environ[key] = value
diff --git a/tests/dataloader/distributed/test_distributed_multidim_dataloader.py b/tests/dataloader/distributed/test_distributed_multidim_dataloader.py
new file mode 100644
index 000000000..e3546b00c
--- /dev/null
+++ b/tests/dataloader/distributed/test_distributed_multidim_dataloader.py
@@ -0,0 +1,84 @@
+import os
+from unittest.mock import MagicMock
+
+import pytest
+from torch.utils.data import BatchSampler
+
+from modalities.dataloader.dataloader_factory import DataloaderFactory
+from modalities.dataloader.sampler_factory import SamplerFactory
+from modalities.running_env.fsdp.device_mesh import ParallelismDegrees
+from tests.dataloader.distributed.mocks import MultiProcessingCudaEnvMock
+from tests.dataloader.dummy_sequential_dataset import TestDataset
+
+
+@pytest.mark.parametrize("world_size, dp_degree", [(4, 2)])
+def test_distributed_multidim_dataloader_produces_same_data_on_connected_non_dp_ranks(world_size: int, dp_degree: int):
+    """Ranks that share a dp_rank but differ in the other mesh coordinate must load identical batches."""
+    batches_on_rank = _build_batch_for_each_rank_combination(world_size, dp_degree)
+
+    for dp_rank in range(dp_degree):
+        # Compare every non-dp rank of this dp_rank against the batches of other_rank 0.
+        assert all(
+            batches_on_rank[(dp_rank, 0)] == batches_on_rank[(dp_rank, other_rank)]
+            for other_rank in range(1, world_size // dp_degree)
+        ), f"Batches on dp_rank {dp_rank} differ across other ranks."
+
+
+@pytest.mark.parametrize("world_size, dp_degree", [(4, 2)])
+def test_distributed_multidim_dataloader_produces_different_data_on_different_dp_ranks(world_size: int, dp_degree: int):
+    """Samples loaded on different data parallel ranks must be disjoint."""
+    batches_on_rank = _build_batch_for_each_rank_combination(world_size, dp_degree)
+
+    for dp_rank1 in range(dp_degree):
+        for dp_rank2 in range(dp_rank1 + 1, dp_degree):
+            # Flatten the list of batches into one list of samples per dp rank.
+            samples_dp_rank1 = sum(batches_on_rank[(dp_rank1, 0)], [])
+            samples_dp_rank2 = sum(batches_on_rank[(dp_rank2, 0)], [])
+            assert (
+                len(set(samples_dp_rank1).intersection(samples_dp_rank2)) == 0
+            ), f"Data samples on different data parallel ranks {dp_rank1} and {dp_rank2} should be disjoint."
+
+
+def _build_batch_for_each_rank_combination(world_size: int, dp_degree: int):
+    """Return a dict mapping each (dp_rank, other_rank) pair to the batches loaded for that pair."""
+    return {
+        (dp_rank, other_rank): _load_data_for_ranks(dp_rank, other_rank, world_size, dp_degree)
+        for dp_rank, other_rank in _get_rank_combinations(world_size, dp_degree)
+    }
+
+
+def _get_rank_combinations(world_size: int, dp_degree: int):
+    """Return all (dp_rank, other_rank) pairs, with other_rank ranging over world_size // dp_degree values."""
+    other_degree = world_size // dp_degree
+    return [(dp_rank, other_rank) for dp_rank in range(dp_degree) for other_rank in range(other_degree)]
+
+
+def _load_data_for_ranks(dp_rank: int, other_rank: int, world_size: int, dp_degree: int):
+    """Load all batches produced for the rank at mesh position (dp_rank, other_rank).
+
+    Returns:
+        The batches yielded by the train dataloader, each converted with tolist().
+    """
+    # The stride has to be the size of the non-dp dimension; a hard-coded 2 only yields unique
+    # global ranks in [0, world_size) when world_size // dp_degree happens to be 2.
+    other_degree = world_size // dp_degree
+    global_rank = dp_rank * other_degree + other_rank
+    with MultiProcessingCudaEnvMock(
+        global_rank=global_rank,
+        local_rank=other_rank,
+        world_size=world_size,
+        rdvz_port=22350,
+    ):
+        device_mesh = _build_device_mesh_mock(world_size, dp_degree, dp_rank, other_rank)
+        dataset = TestDataset(8)
+        sampler = SamplerFactory.create_resumable_distributed_multi_dim_sampler(
+            dataset=dataset, device_mesh=device_mesh, data_parallel_key=ParallelismDegrees.DP_SHARD
+        )
+        batch_sampler = BatchSampler(sampler, batch_size=2, drop_last=True)
+        train_dataloader = DataloaderFactory.get_dataloader(
+            dataloader_tag="train",
+            dataset=dataset,
+            batch_sampler=batch_sampler,
+            collate_fn=None,
+            num_workers=2,
+            pin_memory=False,
+        )
+        return [batch.tolist() for batch in train_dataloader]
+
+
+def _build_device_mesh_mock(world_size: int, dp_degree: int, dp_rank: int, other_rank: int):
+    """Build a dict-based stand-in for a two-dimensional device mesh.
+
+    The returned dict maps ParallelismDegrees.DP_SHARD.value and "other" to MagicMock sub-meshes whose
+    size() and get_coordinate() return the degree and the rank coordinate of the respective dimension.
+    WORLD_SIZE must be set in the environment when this function is called.
+    """
+    dp_device_mesh = MagicMock()
+    dp_device_mesh.size.return_value = dp_degree
+    dp_device_mesh.get_coordinate.return_value = [dp_rank]
+    other_device_mesh = MagicMock()
+    # The non-dp dimension has WORLD_SIZE // dp_degree ranks. Dividing by other_degree instead would
+    # yield dp_degree again, which is only correct when both degrees happen to be equal.
+    other_degree = int(os.environ["WORLD_SIZE"]) // dp_degree
+    other_device_mesh.size.return_value = other_degree
+    other_device_mesh.get_coordinate.return_value = [other_rank]
+    device_mesh_mock = {ParallelismDegrees.DP_SHARD.value: dp_device_mesh, "other": other_device_mesh}
+    return device_mesh_mock
From a84db67e416e1450f3c472e9e2c33a62cc8073a4 Mon Sep 17 00:00:00 2001
From: Timm Ruland
Date: Thu, 23 Oct 2025 17:26:16 +0200
Subject: [PATCH 61/67] fix(parallelism): Use dp degree instead of world size in global_num_tokens_per_train_step computation.
---
 src/modalities/main.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/modalities/main.py b/src/modalities/main.py
index 2f680cf71..6624d9f8b 100644
--- a/src/modalities/main.py
+++ b/src/modalities/main.py
@@ -111,16 +111,21 @@ def run(self, components: TrainingComponentsInstantiationModel):
         )
 
         # Trainer
+        # FIXME replace by get_parallel_degree
+        if components.device_mesh is None:
+            num_pipeline_parallel_ranks = 1
+            num_data_parallel_ranks = 1
+        else:
+            num_pipeline_parallel_ranks = get_num_parallel_ranks(components.device_mesh, ParallelismDegrees.PP)
+            num_data_parallel_ranks = get_num_parallel_ranks(
+                components.device_mesh, ParallelismDegrees.DP_SHARD
+            ) * get_num_parallel_ranks(components.device_mesh, ParallelismDegrees.DP_REPLICATE)
         global_num_tokens_per_train_step = (
             components.settings.step_profile.local_train_micro_batch_size
             * components.settings.step_profile.sequence_length
             * components.settings.step_profile.gradient_accumulation_steps
-            * components.settings.cuda_env.world_size
+            * num_data_parallel_ranks
         )
-        if components.device_mesh is None:
-            num_pipeline_parallel_ranks = 1
-        else:
-            
num_pipeline_parallel_ranks = get_num_parallel_ranks(components.device_mesh, ParallelismDegrees.PP)
         trainer = Trainer(
             global_rank=components.settings.cuda_env.global_rank,
             progress_publisher=progress_publisher,
From bdc684ee1507d31b5a045afa1c5138b1b88ece02 Mon Sep 17 00:00:00 2001
From: Timm Ruland
Date: Thu, 23 Oct 2025 17:28:32 +0200
Subject: [PATCH 62/67] fix(optimizer): Optimizer groups for with and without weight decay instead of per weight decay group.
---
 .../optimizers/optimizer_factory.py | 37 +++++++++++++++++--
 1 file changed, 34 insertions(+), 3 deletions(-)

diff --git a/src/modalities/optimizers/optimizer_factory.py b/src/modalities/optimizers/optimizer_factory.py
index 5a0ae2bdc..9d7af332b 100644
--- a/src/modalities/optimizers/optimizer_factory.py
+++ b/src/modalities/optimizers/optimizer_factory.py
@@ -156,14 +156,45 @@ def _create_optimizer_groups(
             f"model {type(model)} has no parameters with requires_grad=True (i.e., no traininable parameters)."
         )
 
-    optimizer_groups = [
+    optimizer_groups = _built_optimizer_groups_via_weight_decay_split(
+        weight_decay, weight_decay_groups_excluded, weight_decay_groups, params
+    )
+    return optimizer_groups, ["with_weight_decay", "without_weight_decay"]
+
+
+def _built_optimizer_groups_via_weight_decay_split(
+    weight_decay: float,
+    weight_decay_groups_excluded: list[str],
+    weight_decay_groups: dict[str, list[str]],
+    params: dict[str, nn.Parameter],
+) -> OptimizerGroups:
+    """Merge the per-name weight decay groups into exactly two optimizer groups.
+
+    The first returned group holds the parameters of all weight decay groups that are NOT listed in
+    weight_decay_groups_excluded and uses weight_decay; the second group holds the parameters of the
+    excluded groups and uses a weight decay of 0.0.
+
+    Raises:
+        OptimizerError: If one of the two resulting groups contains no parameters.
+    """
+    params_per_weight_decay_groups: list[dict[str, object]] = [
         {
             "params": _filter_params_for_weight_decay_group(params, regex_expressions=weight_decay_groups[group]),
-            "weight_decay": weight_decay if group not in weight_decay_groups_excluded else 0.0,
+            # True means the group is excluded from weight decay. The flag must not be negated here,
+            # otherwise the excluded groups would be the ones that receive weight decay.
+            "exclude": group in weight_decay_groups_excluded,
         }
         for group in weight_decay_groups.keys()
     ]
-    return optimizer_groups, weight_decay_groups.keys()
+
+    optimizer_groups: OptimizerGroups = [
+        {
+            "params": sum((p["params"] for p in params_per_weight_decay_groups if not p["exclude"]), []),
+            "weight_decay": weight_decay,
+        },
+        {
+            "params": sum((p["params"] for p in params_per_weight_decay_groups if p["exclude"]), []),
+            "weight_decay": 0.0,
+        },
+    ]
+
+    if len(optimizer_groups[0]["params"]) == 0 or len(optimizer_groups[1]["params"]) == 0:
+        raise OptimizerError(
+            "One of the optimizer groups has zero parameters. "
+            "This indicates that the weight_decay_groups_excluded configuration is not compatible "
+            "with the configured pipeline stages."
+        )
+
+    return optimizer_groups
 
 
 def _filter_params_for_weight_decay_group(
From 0a8ab53ba7b4d91e890646d509f6270ec85fb9a1 Mon Sep 17 00:00:00 2001
From: Timm Ruland
Date: Thu, 23 Oct 2025 17:33:47 +0200
Subject: [PATCH 63/67] test(parallelism): Added warmstart e2e test with fsdp2 + tp + pp.
---
 .../gpt2_train_num_steps_7_pp_tp.yaml | 364 ++++++++++++++++
 .../gpt2_warm_start_from_step_4_pp_tp.yaml | 393 ++++++++++++++++++
 .../test_fsdp2_warmstart_pp_tp.py | 276 ++++++++++++
 3 files changed, 1033 insertions(+)
 create mode 100644 tests/end2end_tests/gpt2_train_num_steps_7_pp_tp.yaml
 create mode 100644 tests/end2end_tests/gpt2_warm_start_from_step_4_pp_tp.yaml
 create mode 100644 tests/end2end_tests/test_fsdp2_warmstart_pp_tp.py

diff --git a/tests/end2end_tests/gpt2_train_num_steps_7_pp_tp.yaml b/tests/end2end_tests/gpt2_train_num_steps_7_pp_tp.yaml
new file mode 100644
index 000000000..00a93a9d0
--- /dev/null
+++ b/tests/end2end_tests/gpt2_train_num_steps_7_pp_tp.yaml
@@ -0,0 +1,364 @@
+settings:
+  experiment_id: ${modalities_env:experiment_id}
+  config_file_path: ${modalities_env:config_file_path}
+  referencing_keys:
+    sample_key: input_ids
+    target_key: target_ids
+    prediction_key: logits
+  cuda_env:
+    local_rank: ${cuda_env:LOCAL_RANK}
+    global_rank: ${cuda_env:RANK}
+    world_size: ${cuda_env:WORLD_SIZE}
+  paths:
+    checkpoint_saving_path: tmp/checkpoints
+    train_dataset_path: tests/end2end_tests/lorem_ipsum.pbin
+  intervals:
+    training_log_interval_in_steps: 1
+    checkpointing_interval_in_steps: 4
+    
evaluation_interval_in_steps: 1 + consistency_enforcement: + enforce_tokens_per_step_consistency: false + enforce_last_step_logged: false + enforce_last_step_evaluated: false + enforce_last_step_checkpointed: false + step_profile: + gradient_accumulation_steps: 1 + local_train_micro_batch_size: 2 + sequence_length: 256 + training_target: + num_target_tokens: + component_key: number_conversion + variant_key: num_tokens_from_packed_mem_map_dataset_continuous + config: + dataset_path: ${settings.paths.train_dataset_path} + sequence_length: ${settings.step_profile.sequence_length} + num_ranks: 2 # FIXME: adapt to dp_parallel_degree + local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size} + gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps} + num_target_steps: # for the batch progress subscriber + component_key: number_conversion + variant_key: num_steps_from_num_tokens + config: + num_ranks: 2 # FIXME: adapt to dp_parallel_degree + local_micro_batch_size: ${settings.step_profile.local_train_micro_batch_size} + global_num_tokens: ${settings.training_target.num_target_tokens} + sequence_length: ${settings.step_profile.sequence_length} + gradient_accumulation_steps: ${settings.step_profile.gradient_accumulation_steps} + training_progress: + global_num_seen_tokens: 0 + num_seen_steps: 0 + num_seen_samples: 0 + last_step: -1 + +collate_fn: + component_key: collate_fn + variant_key: gpt_2_llm_collator + config: + sample_key: ${settings.referencing_keys.sample_key} + target_key: ${settings.referencing_keys.target_key} + +train_dataset: + component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: ${settings.paths.train_dataset_path} + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + +train_dataloader: + component_key: data_loader + variant_key: default + config: + num_workers: 2 + pin_memory: true + dataloader_tag: train 
+ dataset: + instance_key: train_dataset + pass_type: BY_REFERENCE + batch_sampler: + component_key: batch_sampler + variant_key: default + config: + batch_size: ${settings.step_profile.local_train_micro_batch_size} + drop_last: true + sampler: + component_key: sampler + variant_key: resumable_distributed_multi_dim_sampler + config: + dataset: + instance_key: train_dataset + pass_type: BY_REFERENCE + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + data_parallel_key: dp_shard + shuffle: true + seed: 42 + drop_last: true + skip_num_global_samples: ${settings.training_progress.num_seen_samples} + collate_fn: + instance_key: collate_fn + pass_type: BY_REFERENCE + +eval_dataloaders: [] + +checkpoint_saving: + component_key: checkpoint_saving + variant_key: default + config: + checkpoint_saving_strategy: + component_key: checkpoint_saving_strategy + variant_key: save_k_most_recent_checkpoints_strategy + config: + k: -1 # -1 to save all checkpoints + checkpoint_saving_execution: + component_key: checkpoint_saving_execution + variant_key: dcp + config: + checkpoint_path: ${settings.paths.checkpoint_saving_path} + global_rank: ${settings.cuda_env.global_rank} + experiment_id: ${settings.experiment_id} + +loss_fn: + component_key: loss + variant_key: clm_cross_entropy_loss + config: + target_key: ${settings.referencing_keys.target_key} + prediction_key: ${settings.referencing_keys.prediction_key} + +device_mesh: + component_key: device_mesh + variant_key: default + config: + device_type: cuda + data_parallel_replicate_degree: 1 + pipeline_parallel_degree: 2 + tensor_parallel_degree: 2 + data_parallel_shard_degree: -1 + world_size: ${settings.cuda_env.world_size} + +app_state: + component_key: app_state + variant_key: raw + config: + model: + instance_key: initialized_model + pass_type: BY_REFERENCE + optimizer: + instance_key: optimizer + pass_type: BY_REFERENCE + lr_scheduler: + instance_key: lr_scheduler + pass_type: BY_REFERENCE + +initialized_model: 
+ component_key: model + variant_key: model_initialized + config: + model: + component_key: pipeline + variant_key: selector + config: + pipeline: + instance_key: scheduled_pipeline + pass_type: BY_REFERENCE + selection_type: MODEL_PART + model_initializer: + component_key: model_initialization + variant_key: composed + config: + model_type: gpt2 + weight_init_type: scaled + mean: 0.0 + std: 0.02 + num_layers: ${model_raw.config.n_layer} + +scheduled_pipeline: + component_key: pipeline + variant_key: scheduled + config: + loss_fn: + instance_key: loss_fn + pass_type: BY_REFERENCE + pp_schedule_name: gpipe + batch_size: ${settings.step_profile.local_train_micro_batch_size} + microbatch_size: 1 + pp_degree: ${device_mesh.config.pipeline_parallel_degree} + # maybe better to use the fsdp model and the schedule here + # instead of passing in the staged pipeline? + # If fsdp_model creates a copy then this is not in the scope of + # the staged pipeline. + pipeline: + component_key: pipeline + variant_key: builder + config: + pp_stage: + component_key: pipeline + variant_key: selector + config: + pipeline: + instance_key: staged_pipeline + pass_type: BY_REFERENCE + selection_type: PP_STAGE + model_part: + instance_key: fsdp_model + pass_type: BY_REFERENCE + +fsdp_model: + component_key: model + variant_key: fsdp2_wrapped + config: + model: + instance_key: gpt2_tp_model + pass_type: BY_REFERENCE + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + mixed_precision_settings: + param_dtype: BF_16 + reduce_dtype: BF_16 + block_names: [GPT2Block] + +gpt2_tp_model: + component_key: model + variant_key: gpt2_tp + config: + model: + instance_key: model_part + pass_type: BY_REFERENCE + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + +model_part: + component_key: pipeline + variant_key: selector + config: + pipeline: + instance_key: staged_pipeline + pass_type: BY_REFERENCE + selection_type: MODEL_PART + +staged_pipeline: + component_key: 
pipeline + variant_key: staged + config: + whole_model: + instance_key: model_raw + pass_type: BY_REFERENCE + stages_generator: + component_key: stages_generator + variant_key: gpt2_stages_generator + config: + num_model_layers: ${model_raw.config.n_layer} + input_layer_equivalence: 1 + output_layer_equivalence: 1 + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + local_rank: ${settings.cuda_env.local_rank} + pp_schedule_name: gpipe + num_layers_per_stage: 2 + +model_raw: + component_key: model + variant_key: gpt2 + config: + seed: 42 + use_meta_device: true + use_weight_tying: false + sample_key: ${settings.referencing_keys.sample_key} + poe_type: NOPE + sequence_length: ${settings.step_profile.sequence_length} + prediction_key: ${loss_fn.config.prediction_key} + vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency + n_layer: 2 + n_head_q: 8 + n_head_kv: 8 + ffn_hidden: 128 + n_embd: 128 + dropout: 0.0 + bias: true # True: bias in Linears and LayerNorms, like GPT-2. 
False: a bit better and faster + attention_config: + qkv_transforms: + - type_hint: RotaryTransform + config: + n_embd: ${model_raw.config.n_embd} + n_head: ${model_raw.config.n_head_q} #it has to be head_q here + seq_length_dim: -2 + base_freq: 10000 + attention_implementation: manual + activation_type: swiglu + attention_norm_config: + norm_type: layer_norm + config: + normalized_shape: ${model_raw.config.n_embd} + eps: 1e-5 + ffn_norm_config: + norm_type: layer_norm + config: + normalized_shape: ${model_raw.config.n_embd} + eps: 1e-5 + lm_head_norm_config: + norm_type: layer_norm + config: + normalized_shape: ${model_raw.config.n_embd} + eps: 1e-5 + +lr_scheduler: + component_key: scheduler + variant_key: onecycle_lr + config: + optimizer: + instance_key: optimizer + pass_type: BY_REFERENCE + max_lr: 6e-4 + div_factor: 10 + final_div_factor: 1 + total_steps: ${settings.training_target.num_target_steps} + pct_start: 0.01 + anneal_strategy: cos + last_epoch: ${settings.training_progress.last_step} + +optimizer: + component_key: optimizer + variant_key: adam_w + config: + lr: 0.0001 + betas: [0.9, 0.95] + eps: 1e-8 + weight_decay: 1e-1 + weight_decay_groups_excluded: [embedding, layernorm] + wrapped_model: + instance_key: initialized_model + pass_type: BY_REFERENCE + +gradient_clipper: + component_key: gradient_clipper + variant_key: fsdp2 + config: + wrapped_model: + instance_key: initialized_model + pass_type: BY_REFERENCE + norm_type: P2_NORM + max_norm: 1.0 + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + +progress_subscriber: + component_key: progress_subscriber + variant_key: rich + config: + global_rank: ${settings.cuda_env.global_rank} + num_seen_steps: ${settings.training_progress.num_seen_steps} + num_target_steps: ${settings.training_target.num_target_steps} + train_dataloader_tag: ${train_dataloader.config.dataloader_tag} + eval_dataloaders: + instance_key: eval_dataloaders + pass_type: BY_REFERENCE + +evaluation_subscriber: + 
component_key: results_subscriber + variant_key: save_all + config: {} diff --git a/tests/end2end_tests/gpt2_warm_start_from_step_4_pp_tp.yaml b/tests/end2end_tests/gpt2_warm_start_from_step_4_pp_tp.yaml new file mode 100644 index 000000000..caea5ba49 --- /dev/null +++ b/tests/end2end_tests/gpt2_warm_start_from_step_4_pp_tp.yaml @@ -0,0 +1,393 @@ +settings: + experiment_id: ${modalities_env:experiment_id} + config_file_path: ${modalities_env:config_file_path} + referencing_keys: + sample_key: input_ids + target_key: target_ids + prediction_key: logits + cuda_env: + local_rank: ${cuda_env:LOCAL_RANK} + global_rank: ${cuda_env:RANK} + world_size: ${cuda_env:WORLD_SIZE} + paths: + checkpoint_saving_path: tmp/checkpoints + train_dataset_path: tests/end2end_tests/lorem_ipsum.pbin + intervals: + training_log_interval_in_steps: 1 + checkpointing_interval_in_steps: 2 + evaluation_interval_in_steps: 1 + consistency_enforcement: + enforce_tokens_per_step_consistency: false + enforce_last_step_logged: false + enforce_last_step_evaluated: false + enforce_last_step_checkpointed: false + step_profile: + gradient_accumulation_steps: 1 + local_train_micro_batch_size: 2 + sequence_length: 256 + training_target: + num_target_tokens: + component_key: number_conversion + variant_key: global_num_target_tokens_from_checkpoint_path + config: + checkpoint_path: ${settings.warmstart_checkpoint_paths.model_checkpoint_path} + num_target_steps: # for the batch progress subscriber + component_key: number_conversion + variant_key: num_target_steps_from_checkpoint_path + config: + checkpoint_path: ${settings.warmstart_checkpoint_paths.model_checkpoint_path} + training_progress: + global_num_seen_tokens: # used below + component_key: number_conversion + variant_key: global_num_seen_tokens_from_checkpoint_path + config: + checkpoint_path: ${settings.warmstart_checkpoint_paths.model_checkpoint_path} + num_seen_steps: # for the batch progress subscriber + component_key: number_conversion + 
variant_key: num_seen_steps_from_checkpoint_path + config: + checkpoint_path: ${settings.warmstart_checkpoint_paths.model_checkpoint_path} + num_seen_samples: + component_key: number_conversion + variant_key: num_samples_from_num_tokens + config: + num_tokens: ${settings.training_progress.global_num_seen_tokens} + sequence_length: ${settings.step_profile.sequence_length} + last_step: # for the scheduler + component_key: number_conversion + variant_key: last_step_from_checkpoint_path + config: + checkpoint_path: ${settings.warmstart_checkpoint_paths.model_checkpoint_path} + warmstart_checkpoint_paths: + # we pass in the checkpoint paths as filenames such that the num_target_tokens and num_target_steps can be calculated and correctly passed to the training loop + # Within the test is replaced with the actual path to the checkpoint. + model_checkpoint_path: eid_0-seen_steps_4-seen_tokens_4096-target_steps_7-target_tokens_7168 + optimizer_checkpoint_path: eid_0-seen_steps_4-seen_tokens_4096-target_steps_7-target_tokens_7168 + +collate_fn: + component_key: collate_fn + variant_key: gpt_2_llm_collator + config: + sample_key: ${settings.referencing_keys.sample_key} + target_key: ${settings.referencing_keys.target_key} + +train_dataset: + component_key: dataset + variant_key: packed_mem_map_dataset_continuous + config: + raw_data_path: ${settings.paths.train_dataset_path} + sequence_length: ${settings.step_profile.sequence_length} + sample_key: ${settings.referencing_keys.sample_key} + +train_dataloader: + component_key: data_loader + variant_key: default + config: + num_workers: 2 + pin_memory: true + dataloader_tag: train + dataset: + instance_key: train_dataset + pass_type: BY_REFERENCE + batch_sampler: + component_key: batch_sampler + variant_key: default + config: + batch_size: ${settings.step_profile.local_train_micro_batch_size} + drop_last: true + sampler: + component_key: sampler + variant_key: resumable_distributed_multi_dim_sampler + config: + dataset: + 
instance_key: train_dataset + pass_type: BY_REFERENCE + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + data_parallel_key: dp_shard + shuffle: true + seed: 42 + drop_last: true + skip_num_global_samples: ${settings.training_progress.num_seen_samples} + collate_fn: + instance_key: collate_fn + pass_type: BY_REFERENCE + +eval_dataloaders: [] + +# checkpoint_loading: +# component_key: checkpoint_loading +# variant_key: dcp +# config: +# global_rank: ${settings.cuda_env.global_rank} + +checkpoint_saving: + component_key: checkpoint_saving + variant_key: default + config: + checkpoint_saving_strategy: + component_key: checkpoint_saving_strategy + variant_key: save_k_most_recent_checkpoints_strategy + config: + k: -1 # -1 to save all checkpoints + checkpoint_saving_execution: + component_key: checkpoint_saving_execution + variant_key: dcp + config: + checkpoint_path: ${settings.paths.checkpoint_saving_path} + global_rank: ${settings.cuda_env.global_rank} + experiment_id: ${settings.experiment_id} + +loss_fn: + component_key: loss + variant_key: clm_cross_entropy_loss + config: + target_key: ${settings.referencing_keys.target_key} + prediction_key: ${settings.referencing_keys.prediction_key} + +device_mesh: + component_key: device_mesh + variant_key: default + config: + device_type: cuda + data_parallel_replicate_degree: 1 + pipeline_parallel_degree: 2 + tensor_parallel_degree: 2 + data_parallel_shard_degree: -1 + world_size: ${settings.cuda_env.world_size} + +app_state: + component_key: app_state + variant_key: dcp + config: + raw_app_state: + instance_key: app_state_raw + pass_type: BY_REFERENCE + checkpoint_dir_path: checkpoint/path/will/be/set/in/code + +app_state_raw: + component_key: app_state + variant_key: raw + config: + model: + instance_key: initialized_model + pass_type: BY_REFERENCE + optimizer: + instance_key: optimizer + pass_type: BY_REFERENCE + lr_scheduler: + instance_key: lr_scheduler + pass_type: BY_REFERENCE + +initialized_model: 
+ component_key: model + variant_key: model_initialized + config: + model: + component_key: pipeline + variant_key: selector + config: + pipeline: + instance_key: scheduled_pipeline + pass_type: BY_REFERENCE + selection_type: MODEL_PART + model_initializer: + component_key: model_initialization + variant_key: composed + config: + model_type: gpt2 + weight_init_type: scaled + mean: 0.0 + std: 0.02 + num_layers: ${model_raw.config.n_layer} + +scheduled_pipeline: + component_key: pipeline + variant_key: scheduled + config: + loss_fn: + instance_key: loss_fn + pass_type: BY_REFERENCE + pp_schedule_name: gpipe + batch_size: ${settings.step_profile.local_train_micro_batch_size} + microbatch_size: 1 + pp_degree: ${device_mesh.config.pipeline_parallel_degree} + # maybe better to use the fsdp model and the schedule here + # instead of passing in the staged pipeline? + # If fsdp_model creates a copy then this is not in the scope of + # the staged pipeline. + pipeline: + component_key: pipeline + variant_key: builder + config: + pp_stage: + component_key: pipeline + variant_key: selector + config: + pipeline: + instance_key: staged_pipeline + pass_type: BY_REFERENCE + selection_type: PP_STAGE + model_part: + instance_key: fsdp_model + pass_type: BY_REFERENCE + +fsdp_model: + component_key: model + variant_key: fsdp2_wrapped + config: + model: + instance_key: gpt2_tp_model + pass_type: BY_REFERENCE + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + mixed_precision_settings: + param_dtype: BF_16 + reduce_dtype: BF_16 + block_names: [GPT2Block] + +gpt2_tp_model: + component_key: model + variant_key: gpt2_tp + config: + model: + instance_key: model_part + pass_type: BY_REFERENCE + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + +model_part: + component_key: pipeline + variant_key: selector + config: + pipeline: + instance_key: staged_pipeline + pass_type: BY_REFERENCE + selection_type: MODEL_PART + +staged_pipeline: + component_key: 
pipeline + variant_key: staged + config: + whole_model: + instance_key: model_raw + pass_type: BY_REFERENCE + stages_generator: + component_key: stages_generator + variant_key: gpt2_stages_generator + config: + num_model_layers: ${model_raw.config.n_layer} + input_layer_equivalence: 1 + output_layer_equivalence: 1 + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + local_rank: ${settings.cuda_env.local_rank} + pp_schedule_name: gpipe + num_layers_per_stage: 2 + +model_raw: + component_key: model + variant_key: gpt2 + config: + seed: 42 + use_meta_device: true + use_weight_tying: false + sample_key: ${settings.referencing_keys.sample_key} + poe_type: NOPE + sequence_length: ${settings.step_profile.sequence_length} + prediction_key: ${loss_fn.config.prediction_key} + vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency + n_layer: 2 + n_head_q: 8 + n_head_kv: 8 + ffn_hidden: 128 + n_embd: 128 + dropout: 0.0 + bias: true # True: bias in Linears and LayerNorms, like GPT-2. 
False: a bit better and faster + attention_config: + qkv_transforms: + - type_hint: RotaryTransform + config: + n_embd: ${model_raw.config.n_embd} + n_head: ${model_raw.config.n_head_q} #it has to be head_q here + seq_length_dim: -2 + base_freq: 10000 + attention_implementation: manual + activation_type: swiglu + attention_norm_config: + norm_type: layer_norm + config: + normalized_shape: ${model_raw.config.n_embd} + eps: 1e-5 + ffn_norm_config: + norm_type: layer_norm + config: + normalized_shape: ${model_raw.config.n_embd} + eps: 1e-5 + lm_head_norm_config: + norm_type: layer_norm + config: + normalized_shape: ${model_raw.config.n_embd} + eps: 1e-5 + +lr_scheduler: + component_key: scheduler + variant_key: onecycle_lr + config: + optimizer: + instance_key: optimizer + pass_type: BY_REFERENCE + max_lr: 6e-4 + div_factor: 10 + final_div_factor: 1 + total_steps: ${settings.training_target.num_target_steps} + pct_start: 0.01 + anneal_strategy: cos + # last_epoch: ${settings.training_progress.last_step} # Not required. App state will take care of the correct initialization. 
+ +optimizer: + component_key: optimizer + variant_key: adam_w + config: + lr: 0.0001 + betas: [0.9, 0.95] + eps: 1e-8 + weight_decay: 1e-1 + weight_decay_groups_excluded: [embedding, layernorm] + wrapped_model: + instance_key: initialized_model + pass_type: BY_REFERENCE + +gradient_clipper: + component_key: gradient_clipper + variant_key: fsdp2 + config: + wrapped_model: + instance_key: initialized_model + pass_type: BY_REFERENCE + norm_type: P2_NORM + max_norm: 1.0 + device_mesh: + instance_key: device_mesh + pass_type: BY_REFERENCE + +progress_subscriber: + component_key: progress_subscriber + variant_key: rich + config: + global_rank: ${settings.cuda_env.global_rank} + num_seen_steps: ${settings.training_progress.num_seen_steps} + num_target_steps: ${settings.training_target.num_target_steps} + train_dataloader_tag: ${train_dataloader.config.dataloader_tag} + eval_dataloaders: + instance_key: eval_dataloaders + pass_type: BY_REFERENCE + +evaluation_subscriber: + component_key: results_subscriber + variant_key: save_all + config: {} diff --git a/tests/end2end_tests/test_fsdp2_warmstart_pp_tp.py b/tests/end2end_tests/test_fsdp2_warmstart_pp_tp.py new file mode 100644 index 000000000..63d20d8ba --- /dev/null +++ b/tests/end2end_tests/test_fsdp2_warmstart_pp_tp.py @@ -0,0 +1,276 @@ +import json +import logging +import os +import re +import shutil +import traceback +from pathlib import Path + +import pytest +import torch +import torch.distributed as dist +from pydantic import BaseModel + +from modalities.__main__ import Main, load_app_config_dict +from modalities.batch import EvaluationResultBatch +from modalities.config.config import ProcessGroupBackendType, PydanticLLMDataLoaderIFType +from modalities.config.instantiation_models import TrainingComponentsInstantiationModel +from modalities.dataloader.dataloader import LLMDataLoader +from modalities.logging_broker.messages import Message +from modalities.running_env.cuda_env import CudaEnv +from 
tests.end2end_tests.custom_components import SaveAllResultSubscriber, SaveAllResultSubscriberConfig + + +def extract_seen_steps_and_tokens(filename: str) -> tuple[int, int]: + pattern = r"seen_steps_(\d+)-seen_tokens_(\d+)" + match = re.search(pattern, filename) + return int(match.group(1)), int(match.group(2)) + + +# NOTE: We need to run the tests in a torch distributed environment with at least eight GPUs. +# CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --rdzv-endpoint localhost:29502 --nnodes 1 --nproc_per_node 8 \ +# $(which pytest) path/to/test_fsdp2_warmstart_pp_tp.py + +# NOTE that we can only run one test at a time due to NCCL issues with multiple tests in parallel. +# You can specify the test to run with the -k flag, e.g.: -k test_warm_start + + +working_dir = Path(os.path.dirname(__file__)) +tmp_folder = working_dir / "../tmp/fsdp2_warmstart_pp_tp" + + +class TrainDataloaderInstantiationModel(BaseModel): + settings: TrainingComponentsInstantiationModel.Settings + train_dataloader: PydanticLLMDataLoaderIFType + + +@pytest.mark.skipif( + "RANK" not in os.environ or torch.cuda.device_count() < 8, + reason="This e2e test requires 8 GPUs and a torchrun distributed environment.", +) +class TestWarmstart: + @staticmethod + def get_loss_scores(messages: list[Message[EvaluationResultBatch]], loss_key: str) -> list[float]: + return [message.payload.losses[loss_key].value.item() for message in messages] + + def test_warm_start(self): + # We want to verify that the training continues after starting from checkpoint (i.e, warm start) + # exactly the same way, as if we trained it from scratch. + # To do so, we have two configs. The first config trains a model for 8 steps and + # saves multiple intermediary checkpoints. + # The second config starts from the 4th step and trains the model for 4 more steps. + # We compare the loss values of the two models after 4 steps and expect them to be the same. 
+ + try: + if tmp_folder.exists(): + shutil.rmtree(tmp_folder) + tmp_folder.mkdir(parents=False, exist_ok=False) + # config for two steps model + gpt2_8_steps_config_file_path = working_dir / "gpt2_train_num_steps_7_pp_tp.yaml" + gpt2_8_steps_config_dict = load_app_config_dict(gpt2_8_steps_config_file_path, experiment_id="0") + + # adopt the checkpoint path + checkpoint_path = str(tmp_folder) + gpt2_8_steps_config_dict["checkpoint_saving"]["config"]["checkpoint_saving_execution"]["config"][ + "checkpoint_path" + ] = checkpoint_path + gpt2_8_steps_config_dict["settings"]["paths"]["checkpoint_saving_path"] = checkpoint_path + loss_values_experiment_0_path = checkpoint_path + "/experiment_0_loss_scores.txt" + + # config for one step model + gpt2_warm_start_after_4_steps_config_file_path = working_dir / "gpt2_warm_start_from_step_4_pp_tp.yaml" + gpt2_warm_start_after_4_steps_dict = load_app_config_dict( + gpt2_warm_start_after_4_steps_config_file_path, experiment_id="1" + ) + + # adopt the checkpoint path + gpt2_warm_start_after_4_steps_dict["app_state"]["config"]["checkpoint_dir_path"] = ( + checkpoint_path + "/0/eid_0-seen_steps_4-seen_tokens_4096-target_steps_7-target_tokens_7168" + ) + gpt2_warm_start_after_4_steps_dict["checkpoint_saving"]["config"]["checkpoint_saving_execution"]["config"][ + "checkpoint_path" + ] = checkpoint_path + gpt2_warm_start_after_4_steps_dict["settings"]["paths"]["checkpoint_saving_path"] = checkpoint_path + loss_values_experiment_1_path = checkpoint_path + "/experiment_1_loss_scores.txt" + + with CudaEnv(process_group_backend=ProcessGroupBackendType.nccl): + try: + main_obj_0 = Main(gpt2_8_steps_config_file_path) + main_obj_0.config_dict = gpt2_8_steps_config_dict + main_obj_0.add_custom_component( + component_key="results_subscriber", + variant_key="save_all", + custom_component=SaveAllResultSubscriber, + custom_config=SaveAllResultSubscriberConfig, + ) + print( + 
main_obj_0.config_dict["settings"]["training_target"]["num_target_tokens"]["config"][ + "dataset_path" + ] + ) + components_0 = main_obj_0.build_components( + components_model_type=TrainingComponentsInstantiationModel + ) + main_obj_0.run(components_0) + + # we collect the loss values from rank 0 and store them in the temporary experiment folder + if dist.get_rank() == 0: + messages_0: list[ + Message[EvaluationResultBatch] + ] = components_0.evaluation_subscriber.message_list + loss_scores_0 = TestWarmstart.get_loss_scores(messages_0, "train loss avg") + with open(loss_values_experiment_0_path, "w") as f: + json.dump(loss_scores_0, f) + + # make sure that the checkpoints have been written and checkpoint info file has been updated + checkpoint_info_file_path = Path(checkpoint_path) / "0/last_checkpoint_info.json" + assert checkpoint_info_file_path.exists() + with open(checkpoint_info_file_path, "r") as f: + checkpoint_info = json.load(f) + assert checkpoint_info["checkpoint_folder_path"] == ( + checkpoint_path + "/0/eid_0-seen_steps_4-seen_tokens_4096-target_steps_7-target_tokens_7168" + ) + assert Path(checkpoint_info["checkpoint_folder_path"]).exists() + + checkpoint_paths = list(Path(checkpoint_path).glob("**/*seen_steps_*-seen_tokens_*")) + assert len(checkpoint_paths) > 0 + max_seen_steps = -1 + max_seen_tokens = -1 + for checkpoint_path in checkpoint_paths: + seen_steps, seen_tokens = extract_seen_steps_and_tokens(checkpoint_path.name) + max_seen_steps = max(max_seen_steps, seen_steps) + max_seen_tokens = max(max_seen_tokens, seen_tokens) + + cp_info_model_seen_steps, cp_info_model_seen_tokens = extract_seen_steps_and_tokens( + checkpoint_info["checkpoint_folder_path"] + ) + + assert cp_info_model_seen_steps == max_seen_steps + assert cp_info_model_seen_tokens == max_seen_tokens + + main_obj_1 = Main(gpt2_warm_start_after_4_steps_config_file_path) + main_obj_1.config_dict = gpt2_warm_start_after_4_steps_dict + + main_obj_1.add_custom_component( + 
component_key="results_subscriber", + variant_key="save_all", + custom_component=SaveAllResultSubscriber, + custom_config=SaveAllResultSubscriberConfig, + ) + components_1 = main_obj_1.build_components( + components_model_type=TrainingComponentsInstantiationModel + ) + + assert ( + components_0.app_state.lr_scheduler.base_lrs == components_1.app_state.lr_scheduler.base_lrs + ) # make sure that the initial learning rates are the same + assert components_1.app_state.lr_scheduler.last_epoch == 4 # we start from step 4 + + main_obj_1.run(components_1) + + # we collect the loss values from rank 0 for the warmstart model + # and store them in the temporary experiment folder + if dist.get_rank() == 0: + messages_1: list[ + Message[EvaluationResultBatch] + ] = components_1.evaluation_subscriber.message_list + loss_scores_1 = TestWarmstart.get_loss_scores(messages_1, "train loss avg") + with open(loss_values_experiment_1_path, "w") as f: + json.dump(loss_scores_1, f) + + # read the losses from disc + # note that the temporary directory is only correct for the rank 0. 
+ # rank 1 has a different one and we don't store anything in there + with open(loss_values_experiment_0_path, "r") as f: + loaded_loss_values_0 = json.load(f) + + with open(loss_values_experiment_1_path, "r") as f: + loaded_loss_values_1 = json.load(f) + + # we check if the losses for the model from scratch + # and the warm start model have the same loss values + assert loaded_loss_values_0[4:] == pytest.approx(loaded_loss_values_1, abs=1e-16) + + # assert that the scheduler state is the same for both models + assert ( + components_1.app_state.lr_scheduler.last_epoch == components_0.app_state.lr_scheduler.last_epoch + ) + assert ( + components_0.app_state.lr_scheduler.get_last_lr() + == components_1.app_state.lr_scheduler.get_last_lr() + ) + except Exception as e: + tb = traceback.format_exc() + logging.error(f"Exception in rank {os.environ.get('RANK', -1)}: {e}") + logging.error(tb) + raise + finally: + logging.info(f"Rank {os.environ.get('RANK', -1)} cleaning up.") + if int(os.environ.get("RANK", -1)) == 0: + try: + if tmp_folder.exists(): + shutil.rmtree(tmp_folder) + except Exception as e: + logging.warning(f"Rank {os.environ.get('RANK', -1)}: failed to remove tmp folder {tmp_folder}: {e}") + + def test_warmstart_dataloader(self): + # non-skipped config + gpt2_two_steps_config_file_path = working_dir / "gpt2_train_num_steps_8.yaml" + gpt2_two_steps_config_dict = load_app_config_dict(gpt2_two_steps_config_file_path, experiment_id="0") + + # skipped config + gpt2_warm_start_from_step_1_config_file_path = working_dir / "gpt2_warm_start_from_step_4.yaml" + gpt2_warm_start_from_step_1_dict = load_app_config_dict( + gpt2_warm_start_from_step_1_config_file_path, experiment_id="1" + ) + + with CudaEnv(process_group_backend=ProcessGroupBackendType.nccl): + main_obj_1 = Main(gpt2_two_steps_config_file_path) + main_obj_1.config_dict = gpt2_two_steps_config_dict + + main_obj_2 = Main(gpt2_warm_start_from_step_1_config_file_path) + main_obj_2.config_dict = 
gpt2_warm_start_from_step_1_dict + + main_obj_1.add_custom_component( + component_key="results_subscriber", + variant_key="save_all", + custom_component=SaveAllResultSubscriber, + custom_config=SaveAllResultSubscriberConfig, + ) + components_1: TrainDataloaderInstantiationModel = main_obj_1.build_components( + components_model_type=TrainDataloaderInstantiationModel + ) + dataloader_1: LLMDataLoader = components_1.train_dataloader + dl_1_samples = [s for s in dataloader_1] + + main_obj_2.add_custom_component( + component_key="results_subscriber", + variant_key="save_all", + custom_component=SaveAllResultSubscriber, + custom_config=SaveAllResultSubscriberConfig, + ) + components_2 = main_obj_2.build_components(components_model_type=TrainDataloaderInstantiationModel) + dataloader_2: LLMDataLoader = components_2.train_dataloader + dl_2_samples = [s for s in dataloader_2] + + # fast forward the first dataloader + + num_skip_steps = components_2.settings.training_progress.num_seen_steps + + # make sure that we actually skip as defined in the config + assert num_skip_steps == 4 + assert len(dl_1_samples) == num_skip_steps + len(dl_2_samples) + + # make sure that the first dataloader is not skipped + assert components_1.settings.training_progress.num_seen_steps == 0 + + # iterate through both sample lists from the dataloaders + # and assert the equality of the samples + + for i in range(len(dataloader_2)): + assert dl_1_samples[i + num_skip_steps].samples["input_ids"].equal(dl_2_samples[i].samples["input_ids"]) + + dl_1_samples[i + num_skip_steps].samples["input_ids"][-1] = 0 + assert not ( + dl_1_samples[i + num_skip_steps].samples["input_ids"].equal(dl_2_samples[i].samples["input_ids"]) + ) From 71d1f813d713e811ba46de0ea49c09dd188cb13a Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Fri, 24 Oct 2025 12:29:37 +0200 Subject: [PATCH 64/67] test(gradient_clipping): Check that gradient clipping in pp setting is identical to non stages model. 
--- tests/training/__init__.py | 0 tests/training/gradient_clipping/__init__.py | 0 .../test_fsdp_gradient_clipper.py | 168 ++++++++++++++++++ tests/utility.py | 9 + 4 files changed, 177 insertions(+) create mode 100644 tests/training/__init__.py create mode 100644 tests/training/gradient_clipping/__init__.py create mode 100644 tests/training/gradient_clipping/test_fsdp_gradient_clipper.py create mode 100644 tests/utility.py diff --git a/tests/training/__init__.py b/tests/training/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/training/gradient_clipping/__init__.py b/tests/training/gradient_clipping/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/training/gradient_clipping/test_fsdp_gradient_clipper.py b/tests/training/gradient_clipping/test_fsdp_gradient_clipper.py new file mode 100644 index 000000000..21a7e4a33 --- /dev/null +++ b/tests/training/gradient_clipping/test_fsdp_gradient_clipper.py @@ -0,0 +1,168 @@ +import tempfile +from multiprocessing import Queue + +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +import torch.nn as nn + +from modalities.training.gradient_clipping.fsdp_gradient_clipper import FSDP2GradientClipper +from tests.utility import find_free_port + + +def test_pipeline_parallelized_clipping_equivalent_to_single_stage_clipping(): + max_norm = 0.1 + # create full model and initialize deterministically + torch.manual_seed(42) + full = FullModel() + + # create an input and compute gradients on the full model + x = torch.randn(2, 4) + out = full(x) + loss = out.pow(2).sum() + loss.backward() + + # save full model state and grads to a temporary file for workers + state = {} + for name, p in full.named_parameters(): + # store parameter data and grads on CPU + state[name] = p.data.cpu().clone() + grads = {} + for name, p in full.named_parameters(): + grads[name] = p.grad.cpu().clone() + + with tempfile.NamedTemporaryFile() as tmp: + store_path = 
tmp.name + torch.save({"state": state, "grads": grads}, store_path) + + # set up multiprocessing to simulate 2 pipeline stages + world_size = 2 + port = find_free_port() + q = mp.get_context("spawn").Queue() + mp.spawn(_worker, args=(world_size, store_path, port, max_norm, q), nprocs=world_size, join=True) + + # collect results + results = {} + for _ in range(world_size): + rank, coll = q.get() + results[rank] = coll + + # perform clipping on the full model (single-stage) + FSDP2GradientClipper.clip_grad_norm_( + parameters=full.parameters(), + max_norm=max_norm, + norm_type=2.0, + error_if_nonfinite=True, + foreach=True, + device_mesh=None, + ) + + # compare full model parts to the per-stage results + full_a_params = [p.data.cpu() for p in full.a.parameters()] + full_b_params = [p.data.cpu() for p in full.b.parameters()] + + # ranks: 0 -> partA, 1 -> partB + assert 0 in results and 1 in results + + for p_full, p_pp in zip(full_a_params, results[0]): + t_pp = torch.as_tensor(p_pp, dtype=p_full.dtype) + assert torch.allclose(p_full, t_pp, atol=1e-6, rtol=1e-5) + + for p_full, p_pp in zip(full_b_params, results[1]): + t_pp = torch.as_tensor(p_pp, dtype=p_full.dtype) + assert torch.allclose(p_full, t_pp, atol=1e-6, rtol=1e-5) + + +class PartA(nn.Module): + def __init__(self): + super().__init__() + self.lin = nn.Linear(4, 5, bias=False) + + def forward(self, x: torch.Tensor): + return self.lin(x) + + +class PartB(nn.Module): + def __init__(self): + super().__init__() + self.lin = nn.Linear(5, 3, bias=False) + + def forward(self, x: torch.Tensor): + return self.lin(x) + + +class FullModel(nn.Module): + def __init__(self): + super().__init__() + self.a = PartA() + self.b = PartB() + + def forward(self, x: torch.Tensor): + return self.b(self.a(x)) + + +def _worker(rank: int, world_size: int, store_path: str, port: int, max_norm: float, q: Queue): + # initialize distributed + dist.init_process_group(backend="gloo", init_method=f"tcp://127.0.0.1:{port}", rank=rank, 
world_size=world_size) + + # load saved full model state and grads + data = torch.load(store_path) + state = data["state"] + grads = data["grads"] + + # create the corresponding part for this rank and load weights + if rank == 0: + part = PartA() + # map parameters from full model: a.lin.weight + part.lin.weight.data.copy_(state["a.lin.weight"]) + # assign gradients + for name, p in part.named_parameters(): + full_name = f"a.{name}" + if full_name in grads: + p.grad = grads[full_name].clone() + else: + part = PartB() + part.lin.weight.data.copy_(state["b.lin.weight"]) + for name, p in part.named_parameters(): + full_name = f"b.{name}" + if full_name in grads: + p.grad = grads[full_name].clone() + + # create a dummy device_mesh-like object that matches the parts of DeviceMesh + # expected by get_mesh_for_parallelism_method and FSDP2GradientClipper. + class DummyPPMesh: + def __init__(self, group): + self._group = group + + def get_group(self): + return self._group + + class DummyDeviceMesh: + def __init__(self, group): + # include the PP mesh name so get_mesh_for_parallelism_method finds it + self.mesh_dim_names = ("pp",) + self._pp = DummyPPMesh(group) + + def __getitem__(self, name: str): + if name == "pp": + return self._pp + raise KeyError(name) + + mesh = DummyDeviceMesh(dist.group.WORLD) + + # call the clipping function which will perform all_reduce across the pp group + FSDP2GradientClipper.clip_grad_norm_( + parameters=part.parameters(), + max_norm=max_norm, + norm_type=2.0, + error_if_nonfinite=True, + foreach=True, + device_mesh=mesh, + ) + + # collect clipped parameter tensors (cpu) and serialize to plain Python lists + # to avoid multiprocessing shared-storage pickling issues. 
+ collected = [p.data.cpu().numpy().tolist() for p in part.parameters()] + q.put((rank, collected)) + + dist.destroy_process_group() diff --git a/tests/utility.py b/tests/utility.py new file mode 100644 index 000000000..c839bc76f --- /dev/null +++ b/tests/utility.py @@ -0,0 +1,9 @@ +import socket + + +def find_free_port(): + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.bind(("127.0.0.1", 0)) + port = s.getsockname()[1] + s.close() + return port From c0cf6a2f1702c18ce14d91bb4704ef343df5468e Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Fri, 24 Oct 2025 15:28:56 +0200 Subject: [PATCH 65/67] refactor(gradient_clipping): Removed duplicate code in fsdp2 gradient clipping. Also: - Removed DummyGradientClipper which is not used for anything. - Moved all gradient clipping tests to single file. --- src/modalities/registry/components.py | 3 - .../fsdp_gradient_clipper.py | 177 +++++++----------- .../fsdp_gradient_clipper_config.py | 16 -- tests/test_gradient_clipping.py | 160 ---------------- .../test_fsdp_gradient_clipper.py | 168 +++++++++++++++-- 5 files changed, 224 insertions(+), 300 deletions(-) delete mode 100644 tests/test_gradient_clipping.py diff --git a/src/modalities/registry/components.py b/src/modalities/registry/components.py index 9a3a3c46a..d56946060 100644 --- a/src/modalities/registry/components.py +++ b/src/modalities/registry/components.py @@ -104,14 +104,12 @@ from modalities.running_env.fsdp.device_mesh import DeviceMeshConfig, get_device_mesh from modalities.tokenization.tokenizer_wrapper import PreTrainedHFTokenizer, PreTrainedSPTokenizer from modalities.training.gradient_clipping.fsdp_gradient_clipper import ( - DummyGradientClipper, FSDP1GradientClipper, FSDP1LoggingOnlyGradientClipper, FSDP2GradientClipper, FSDP2LoggingOnlyGradientClipper, ) from modalities.training.gradient_clipping.fsdp_gradient_clipper_config import ( - DummyGradientClipperConfig, FSDP1DummyGradientClipperConfig, FSDP1GradientClipperConfig, 
FSDP2DummyGradientClipperConfig, @@ -335,7 +333,6 @@ class ComponentEntity: ComponentEntity( "gradient_clipper", "fsdp2_logging_only", FSDP2LoggingOnlyGradientClipper, FSDP2DummyGradientClipperConfig ), - ComponentEntity("gradient_clipper", "dummy", DummyGradientClipper, DummyGradientClipperConfig), # MFU calculators ComponentEntity("mfu_calculator", "gpt2", GPT2MFUCalculator, GPT2MFUCalculatorConfig), # Number conversion diff --git a/src/modalities/training/gradient_clipping/fsdp_gradient_clipper.py b/src/modalities/training/gradient_clipping/fsdp_gradient_clipper.py index c4009cf41..129b8ad93 100644 --- a/src/modalities/training/gradient_clipping/fsdp_gradient_clipper.py +++ b/src/modalities/training/gradient_clipping/fsdp_gradient_clipper.py @@ -1,5 +1,5 @@ import math -from typing import Iterable, Optional +from typing import Optional import torch from torch import distributed as dist @@ -92,87 +92,56 @@ def clip_gradients(self) -> torch.Tensor: return gradient_norm_score -class FSDP2GradientClipper(GradientClipperIF): - """The FSDP2GradientClipper class that is responsible for clipping the gradients of a model wrapped with FSDP.""" +class FSDP2LoggingOnlyGradientClipper(GradientClipperIF): + """The FSDP2LoggingOnlyGradientClipper class that is responsible for logging the gradient + norms without actually clipping the gradients.""" def __init__( self, wrapped_model: FSDP2, - max_norm: float, - norm_type=GradientClippingMode, + norm_type: GradientClippingMode, device_mesh: Optional[DeviceMesh] = None, + error_if_nonfinite: bool = False, + foreach: Optional[bool] = None, ) -> None: """ - Initialize the FSDP2GradientClipper object. + Initialize the FSDP2LoggingOnlyGradientClipper. Args: - wrapped_model (FSDP2): The wrapped model. - max_norm (float): The maximum norm value for gradient clipping. - norm_type (GradientClippingMode, optional): The type of gradient clipping. Defaults to GradientClippingMode. + wrapped_model (FSDP2): The wrapped FSDP2 model. 
+ norm_type (GradientClippingMode): The type of gradient clipping. device_mesh (DeviceMesh, optional): The device mesh used for distributed training. Defaults to None. + error_if_nonfinite (bool): if True, an error is thrown if the total + norm of the gradients from :attr:`parameters` is ``nan``, + ``inf``, or ``-inf``. Default: False (will switch to True in the future) + foreach (bool): use the faster foreach-based implementation. + If ``None``, use the foreach implementation for CUDA and CPU native tensors and silently + fall back to the slow implementation for other device types. + Default: ``None`` Returns: None """ self.wrapped_model = wrapped_model - self.max_norm = max_norm self.norm_type = norm_type self.device_mesh = device_mesh + self.error_if_nonfinite = error_if_nonfinite + self.foreach = foreach @torch.no_grad() def clip_gradients(self) -> torch.Tensor: """ - Clips the gradients of the wrapped model using the specified maximum norm and norm type. - - Returns: - torch.Tensor: The gradient norm after clipping. - """ - gradient_norm_score = FSDP2GradientClipper.clip_grad_norm_( - parameters=self.wrapped_model.parameters(), - max_norm=self.max_norm, - norm_type=self.norm_type.value, - error_if_nonfinite=True, - foreach=True, - device_mesh=self.device_mesh, - ) - return gradient_norm_score - - @staticmethod - def clip_grad_norm_( - parameters: torch.Tensor | Iterable[torch.Tensor], - max_norm: float, - norm_type: float = 2.0, - error_if_nonfinite: bool = False, - foreach: Optional[bool] = None, - device_mesh: Optional[DeviceMesh] = None, - ) -> torch.Tensor: - """ - Clip the gradient norm of an iterable of parameters. - - Gradient norm clipping requires computing the gradient norm over the entire model. - `torch.nn.utils.clip_grad_norm_` only computes gradient norm along DP/FSDP/TP dimensions. 
- - Args: - parameters: an iterable of Tensors or a single Tensor that will have gradients normalized - max_norm (float): max norm of the gradients - norm_type (float): type of the used p-norm. Can be ``'inf'`` for - infinity norm. - error_if_nonfinite (bool): if True, an error is thrown if the total - norm of the gradients from :attr:`parameters` is ``nan``, - ``inf``, or ``-inf``. Default: False (will switch to True in the future) - foreach (bool): use the faster foreach-based implementation. - If ``None``, use the foreach implementation for CUDA and CPU native tensors and silently - fall back to the slow implementation for other device types. - Default: ``None`` - device_mesh: device mesh + Returns the gradient norm, but does not apply clipping since max_norm is set to inifinity. Returns: - Total norm of the parameter gradients (viewed as a single vector). - + torch.Tensor: The gradient norms. """ - grads = [p.grad for p in parameters if p.grad is not None] + grads = [p.grad for p in self.wrapped_model.parameters() if p.grad is not None] total_norm = torch.nn.utils.get_total_norm( - tensors=grads, norm_type=norm_type, error_if_nonfinite=error_if_nonfinite, foreach=foreach + tensors=grads, + norm_type=self.norm_type.value, + error_if_nonfinite=self.error_if_nonfinite, + foreach=self.foreach, ) # Inspired by torch titan @@ -183,86 +152,74 @@ def clip_grad_norm_( if isinstance(total_norm, DTensor): # Will reach here if any non-PP parallelism is used. # If only using PP, total_norm will be a local tensor. 
- total_norm = total_norm.full_tensor() - pp_mesh = get_mesh_for_parallelism_method(device_mesh=device_mesh, parallelism_method=ParallelismDegrees.PP) + pp_mesh = get_mesh_for_parallelism_method( + device_mesh=self.device_mesh, parallelism_method=ParallelismDegrees.PP + ) if pp_mesh is not None: - if math.isinf(norm_type): + if math.isinf(self.norm_type.value): dist.all_reduce(total_norm, op=dist.ReduceOp.MAX, group=pp_mesh.get_group()) else: - total_norm **= norm_type + total_norm **= self.norm_type.value dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=pp_mesh.get_group()) - total_norm **= 1.0 / norm_type - - torch.nn.utils.clip_grads_with_norm_(parameters, max_norm, total_norm, foreach) + total_norm **= 1.0 / self.norm_type.value return total_norm -class FSDP2LoggingOnlyGradientClipper(GradientClipperIF): - """The FSDP2LoggingOnlyGradientClipper class that is responsible for logging the gradient - norms without actually clipping the gradients.""" +class FSDP2GradientClipper(FSDP2LoggingOnlyGradientClipper): + """The FSDP2GradientClipper class that is responsible for clipping the gradients of a model wrapped with FSDP.""" def __init__( - self, wrapped_model: FSDP2, norm_type=GradientClippingMode, device_mesh: Optional[DeviceMesh] = None + self, + wrapped_model: FSDP2, + max_norm: float, + norm_type: GradientClippingMode, + device_mesh: Optional[DeviceMesh] = None, + error_if_nonfinite: bool = False, + foreach: Optional[bool] = None, ) -> None: """ - Initialize the FSDP2LoggingOnlyGradientClipper. + Initialize the FSDP2GradientClipper object. Args: wrapped_model (FSDP2): The wrapped FSDP2 model. - norm_type (GradientClippingMode, optional): The type of gradient clipping. Defaults to GradientClippingMode. + max_norm (float): The maximum norm value for gradient clipping. + norm_type (GradientClippingMode): The type of gradient clipping. device_mesh (DeviceMesh, optional): The device mesh used for distributed training. Defaults to None. 
+ error_if_nonfinite (bool): if True, an error is thrown if the total + norm of the gradients from :attr:`parameters` is ``nan``, + ``inf``, or ``-inf``. Default: False (will switch to True in the future) + foreach (bool): use the faster foreach-based implementation. + If ``None``, use the foreach implementation for CUDA and CPU native tensors and silently + fall back to the slow implementation for other device types. + Default: ``None`` Returns: None """ - self.wrapped_model = wrapped_model - self.norm_type = norm_type - self.device_mesh = device_mesh + super().__init__( + wrapped_model=wrapped_model, + norm_type=norm_type, + device_mesh=device_mesh, + error_if_nonfinite=error_if_nonfinite, + foreach=foreach, + ) + self.max_norm = max_norm @torch.no_grad() def clip_gradients(self) -> torch.Tensor: """ - Returns the gradient norm, but does not apply clipping since max_norm is set to inifinity. + Clips the gradients of the wrapped model using the specified maximum norm and norm type. Returns: - torch.Tensor: The gradient norms. + torch.Tensor: The gradient norm after clipping. """ - grads = [p.grad for p in self.wrapped_model.parameters() if p.grad is not None] - total_norm = torch.nn.utils.get_total_norm( - tensors=grads, norm_type=self.norm_type.value, error_if_nonfinite=False, foreach=True - ) - if isinstance(total_norm, DTensor): - # Will reach here if any non-PP parallelism is used. - # If only using PP, total_norm will be a local tensor. 
- total_norm = total_norm.full_tensor() - - pp_mesh = get_mesh_for_parallelism_method( - device_mesh=self.device_mesh, parallelism_method=ParallelismDegrees.PP + total_norm = super().clip_gradients() + torch.nn.utils.clip_grads_with_norm_( + parameters=self.wrapped_model.parameters(), + max_norm=self.max_norm, + total_norm=total_norm, + foreach=self.foreach, ) - if pp_mesh is not None: - if math.isinf(self.norm_type.value): - dist.all_reduce(total_norm, op=dist.ReduceOp.MAX, group=pp_mesh.get_group()) - else: - total_norm **= self.norm_type.value - dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=pp_mesh.get_group()) - total_norm **= 1.0 / self.norm_type.value return total_norm - - -class DummyGradientClipper(GradientClipperIF): - """The DummyGradientClipper class that does not apply gradient clipping.""" - - def __init__(self) -> None: - pass - - def clip_gradients(self) -> torch.Tensor: - """ - Returns a tensor with value -1.0 indicating that DummyGradientClipper does not actually apply gradient clipping. - - Returns: - torch.Tensor: Tensor with value -1.0 - """ - gradient_norm_score = torch.Tensor([-1.0]) - return gradient_norm_score diff --git a/src/modalities/training/gradient_clipping/fsdp_gradient_clipper_config.py b/src/modalities/training/gradient_clipping/fsdp_gradient_clipper_config.py index 310fb9b60..b19971d69 100644 --- a/src/modalities/training/gradient_clipping/fsdp_gradient_clipper_config.py +++ b/src/modalities/training/gradient_clipping/fsdp_gradient_clipper_config.py @@ -81,19 +81,3 @@ class FSDP2DummyGradientClipperConfig(FSDP1DummyGradientClipperConfig): wrapped_model: PydanticPytorchModuleType norm_type: GradientClippingMode device_mesh: PydanticDeviceMeshIFType | None = None - - -class DummyGradientClipperConfig(BaseModel): - """ - Configuration class for dummy gradient clipper. - - This class is a placeholder and does not have any specific functionality. 
- - Attributes: - None - - Methods: - None - """ - - pass diff --git a/tests/test_gradient_clipping.py b/tests/test_gradient_clipping.py deleted file mode 100644 index 14ff0e7d8..000000000 --- a/tests/test_gradient_clipping.py +++ /dev/null @@ -1,160 +0,0 @@ -import types -from unittest.mock import MagicMock - -import torch - -from modalities.training.gradient_clipping.fsdp_gradient_clipper import ( - DummyGradientClipper, - FSDP1GradientClipper, - FSDP1LoggingOnlyGradientClipper, - FSDP2GradientClipper, - FSDP2LoggingOnlyGradientClipper, - GradientClippingMode, -) - - -class MockFSDPModel: - def __init__(self): - self.param1 = torch.nn.Parameter(torch.tensor([1.0, 2.0])) - self.param2 = torch.nn.Parameter(torch.tensor([3.0, 4.0])) - self.param1.grad = torch.tensor([1.0, 1.0]) - self.param2.grad = torch.tensor([1.0, 1.0]) - - def parameters(self): - return [self.param1, self.param2] - - -# Test for FSDP1 gradient clipper -def test_fsdp1_gradient_clipper(): - """ - Test FSDP1GradientClipper's ability to clip gradients correctly. - Uses a mock model with a dynamically added clip_grad_norm_ method to verify norm calculation and gradient scaling. - """ - mock_model = MockFSDPModel() - max_norm = 1.0 - norm_type = GradientClippingMode.P2_NORM - - # Note: FSDPGradientClipper requires clip_grad_norm_, but user's model lacks it. - # To use FSDPGradientClipper, we’d need to add this method, which deviates from the request. - # For strict adherence, we could skip this test or raise an error, but let’s adapt. - # Temporarily extend MockFSDPModel in this test (with a comment explaining). 
- def clip_grad_norm_(self, max_norm, norm_type): - params = [p for p in self.parameters() if p.grad is not None] - total_norm = torch.norm(torch.stack([torch.norm(p.grad, norm_type) for p in params]), norm_type) - clip_coef = max_norm / (total_norm + 1e-6) - if clip_coef < 1: - for p in params: - p.grad.data.mul_(clip_coef) - return total_norm - - # Dynamically add the method for this test - mock_model.clip_grad_norm_ = types.MethodType(clip_grad_norm_, mock_model) - - clipper = FSDP1GradientClipper(wrapped_model=mock_model, max_norm=max_norm, norm_type=norm_type) - norm = clipper.clip_gradients() - - # Expected norm before clipping: sqrt(1^2 + 1^2 + 1^2 + 1^2) = 2.0 - expected_norm = torch.tensor(2.0) - assert torch.allclose(norm, expected_norm), f"Expected norm {expected_norm}, got {norm}" - - # Gradients should be scaled to max_norm / total_norm = 1.0 / 2.0 = 0.5 - expected_grad = torch.tensor([0.5, 0.5]) - for param in mock_model.parameters(): - assert torch.allclose(param.grad, expected_grad), f"Expected grad {expected_grad}, got {param.grad}" - - -def test_fsdp1_logging_only_gradient_clipper(): - """ - Test that FSDP1LoggingOnlyGradientClipper calls clip_grad_norm_ with max_norm=torch.inf, - ensuring no clipping occurs, and returns the gradient norm. - """ - # Create a mock FSDP1 model - mock_model = MagicMock() - norm_type = GradientClippingMode.P2_NORM - clipper = FSDP1LoggingOnlyGradientClipper(wrapped_model=mock_model, norm_type=norm_type) - - # Call clip_gradients - clipper.clip_gradients() - - # Verify that clip_grad_norm_ was called with max_norm=torch.inf - mock_model.clip_grad_norm_.assert_called_once_with(max_norm=torch.inf, norm_type=norm_type.value) - - -def test_fsdp2_clip_grad_norm(): - """ - Test the static clip_grad_norm_ method in FSDP2GradientClipper to ensure it correctly - computes the gradient norm and clips gradients when necessary. 
- """ - # Create parameters with gradients - mock_model = MockFSDPModel() - - # Compute expected total norm (Euclidean norm, norm_type=2) - expected_norm = (1**2 + 1**2 + 1**2 + 1**2) ** 0.5 # sqrt(4) = 2.0 - - # Test case 1: max_norm > total_norm (no clipping) - max_norm = expected_norm + 1 # 3.0 - norm = FSDP2GradientClipper.clip_grad_norm_(parameters=mock_model.parameters(), max_norm=max_norm, norm_type=2.0) - assert torch.allclose(norm, torch.tensor(expected_norm)), "Norm should match expected total norm" - assert torch.allclose(mock_model.param1.grad, torch.tensor([1.0, 1.0])), "Gradients should not be clipped" - assert torch.allclose(mock_model.param2.grad, torch.tensor([1.0, 1.0])), "Gradients should not be clipped" - - # Test case 2: max_norm < total_norm (clipping occurs) - max_norm = expected_norm / 2 # 1.0 - norm = FSDP2GradientClipper.clip_grad_norm_(parameters=mock_model.parameters(), max_norm=max_norm, norm_type=2.0) - assert torch.allclose(norm, torch.tensor(expected_norm)), "Norm should match pre-clipping total norm" - scale = max_norm / expected_norm # 1.0 / 2.0 = 0.5 - expected_grad = torch.tensor([1.0 * scale, 1.0 * scale]) - assert torch.allclose(mock_model.param1.grad, expected_grad), "Gradients should be clipped" - assert torch.allclose(mock_model.param2.grad, expected_grad), "Gradients should be clipped" - - -def test_fsdp2_gradient_clipper(): - """ - Test that FSDP2GradientClipper correctly calls clip_grad_norm_ on the wrapped model's parameters. 
- """ - # Create a mock FSDP2 model with parameters - - mock_model = MockFSDPModel() - - max_norm = 1.0 - norm_type = GradientClippingMode.P2_NORM - clipper = FSDP2GradientClipper(wrapped_model=mock_model, max_norm=max_norm, norm_type=norm_type) - - # Call clip_gradients - norm = clipper.clip_gradients() - - expected_norm = (1**2 + 1**2 + 1**2 + 1**2) ** 0.5 # 2.0 - assert torch.allclose(norm, torch.tensor(expected_norm)), "Norm should match expected total norm" - - scale = max_norm / expected_norm # 0.5 - expected_grad = torch.tensor([1.0 * scale, 1.0 * scale]) - for param in mock_model.parameters(): - assert torch.allclose(param.grad, expected_grad), "Gradients should be clipped" - - -def test_fsdp2_logging_only_gradient_clipper(): - """ - Test that FSDP2LoggingOnlyGradientClipper computes the gradient norm without clipping. - """ - mock_model = MockFSDPModel() - - norm_type = GradientClippingMode.P2_NORM - clipper = FSDP2LoggingOnlyGradientClipper(wrapped_model=mock_model, norm_type=norm_type) - - # Call clip_gradients - norm = clipper.clip_gradients() - - # Verify the norm and that gradients are unchanged - expected_norm = (1**2 + 1**2 + 1**2 + 1**2) ** 0.5 # 2.0 - assert torch.allclose(norm, torch.tensor(expected_norm)), "Norm should match expected total norm" - for param in mock_model.parameters(): - assert torch.allclose(param.grad, torch.tensor([1.0, 1.0])), "Gradients should not be modified" - - -def test_dummy_gradient_clipper(): - """ - Test that DummyGradientClipper returns a tensor with -1.0 and does not affect gradients. 
- """ - clipper = DummyGradientClipper() - norm = clipper.clip_gradients() - assert torch.allclose(norm, torch.tensor([-1.0])), "Norm should be -1.0 indicating no clipping" diff --git a/tests/training/gradient_clipping/test_fsdp_gradient_clipper.py b/tests/training/gradient_clipping/test_fsdp_gradient_clipper.py index 21a7e4a33..edc797151 100644 --- a/tests/training/gradient_clipping/test_fsdp_gradient_clipper.py +++ b/tests/training/gradient_clipping/test_fsdp_gradient_clipper.py @@ -1,15 +1,161 @@ import tempfile +import types from multiprocessing import Queue +from unittest.mock import MagicMock import torch import torch.distributed as dist import torch.multiprocessing as mp import torch.nn as nn -from modalities.training.gradient_clipping.fsdp_gradient_clipper import FSDP2GradientClipper +from modalities.training.gradient_clipping.fsdp_gradient_clipper import ( + FSDP1GradientClipper, + FSDP1LoggingOnlyGradientClipper, + FSDP2GradientClipper, + FSDP2LoggingOnlyGradientClipper, + GradientClippingMode, +) from tests.utility import find_free_port +class MockFSDPModel: + def __init__(self): + self.param1 = torch.nn.Parameter(torch.tensor([1.0, 2.0])) + self.param2 = torch.nn.Parameter(torch.tensor([3.0, 4.0])) + self.param1.grad = torch.tensor([1.0, 1.0]) + self.param2.grad = torch.tensor([1.0, 1.0]) + + def parameters(self): + return [self.param1, self.param2] + + +# Test for FSDP1 gradient clipper +def test_fsdp1_gradient_clipper(): + """ + Test FSDP1GradientClipper's ability to clip gradients correctly. + Uses a mock model with a dynamically added clip_grad_norm_ method to verify norm calculation and gradient scaling. + """ + mock_model = MockFSDPModel() + max_norm = 1.0 + norm_type = GradientClippingMode.P2_NORM + + # Note: FSDPGradientClipper requires clip_grad_norm_, but user's model lacks it. + # To use FSDPGradientClipper, we’d need to add this method, which deviates from the request. 
+ # For strict adherence, we could skip this test or raise an error, but let’s adapt. + # Temporarily extend MockFSDPModel in this test (with a comment explaining). + def clip_grad_norm_(self, max_norm, norm_type): + params = [p for p in self.parameters() if p.grad is not None] + total_norm = torch.norm(torch.stack([torch.norm(p.grad, norm_type) for p in params]), norm_type) + clip_coef = max_norm / (total_norm + 1e-6) + if clip_coef < 1: + for p in params: + p.grad.data.mul_(clip_coef) + return total_norm + + # Dynamically add the method for this test + mock_model.clip_grad_norm_ = types.MethodType(clip_grad_norm_, mock_model) + + clipper = FSDP1GradientClipper(wrapped_model=mock_model, max_norm=max_norm, norm_type=norm_type) + norm = clipper.clip_gradients() + + # Expected norm before clipping: sqrt(1^2 + 1^2 + 1^2 + 1^2) = 2.0 + expected_norm = torch.tensor(2.0) + assert torch.allclose(norm, expected_norm), f"Expected norm {expected_norm}, got {norm}" + + # Gradients should be scaled to max_norm / total_norm = 1.0 / 2.0 = 0.5 + expected_grad = torch.tensor([0.5, 0.5]) + for param in mock_model.parameters(): + assert torch.allclose(param.grad, expected_grad), f"Expected grad {expected_grad}, got {param.grad}" + + +def test_fsdp1_logging_only_gradient_clipper(): + """ + Test that FSDP1LoggingOnlyGradientClipper calls clip_grad_norm_ with max_norm=torch.inf, + ensuring no clipping occurs, and returns the gradient norm. 
+ """ + # Create a mock FSDP1 model + mock_model = MagicMock() + norm_type = GradientClippingMode.P2_NORM + clipper = FSDP1LoggingOnlyGradientClipper(wrapped_model=mock_model, norm_type=norm_type) + + # Call clip_gradients + clipper.clip_gradients() + + # Verify that clip_grad_norm_ was called with max_norm=torch.inf + mock_model.clip_grad_norm_.assert_called_once_with(max_norm=torch.inf, norm_type=norm_type.value) + + +def test_fsdp2_clip_grad_norm(): + """ + Test the static clip_grad_norm_ method in FSDP2GradientClipper to ensure it correctly + computes the gradient norm and clips gradients when necessary. + """ + # Create parameters with gradients + mock_model = MockFSDPModel() + + # Compute expected total norm (Euclidean norm, norm_type=2) + expected_norm = (1**2 + 1**2 + 1**2 + 1**2) ** 0.5 # sqrt(4) = 2.0 + + # Test case 1: max_norm > total_norm (no clipping) + max_norm = expected_norm + 1 # 3.0 + norm = FSDP2GradientClipper(mock_model, max_norm=max_norm, norm_type=GradientClippingMode.P2_NORM).clip_gradients() + assert torch.allclose(norm, torch.tensor(expected_norm)), "Norm should match expected total norm" + assert torch.allclose(mock_model.param1.grad, torch.tensor([1.0, 1.0])), "Gradients should not be clipped" + assert torch.allclose(mock_model.param2.grad, torch.tensor([1.0, 1.0])), "Gradients should not be clipped" + + # Test case 2: max_norm < total_norm (clipping occurs) + max_norm = expected_norm / 2 # 1.0 + norm = FSDP2GradientClipper(mock_model, max_norm=max_norm, norm_type=GradientClippingMode.P2_NORM).clip_gradients() + assert torch.allclose(norm, torch.tensor(expected_norm)), "Norm should match pre-clipping total norm" + scale = max_norm / expected_norm # 1.0 / 2.0 = 0.5 + expected_grad = torch.tensor([1.0 * scale, 1.0 * scale]) + assert torch.allclose(mock_model.param1.grad, expected_grad), "Gradients should be clipped" + assert torch.allclose(mock_model.param2.grad, expected_grad), "Gradients should be clipped" + + +def 
test_fsdp2_gradient_clipper(): + """ + Test that FSDP2GradientClipper correctly calls clip_grad_norm_ on the wrapped model's parameters. + """ + # Create a mock FSDP2 model with parameters + + mock_model = MockFSDPModel() + + max_norm = 1.0 + norm_type = GradientClippingMode.P2_NORM + clipper = FSDP2GradientClipper(wrapped_model=mock_model, max_norm=max_norm, norm_type=norm_type) + + # Call clip_gradients + norm = clipper.clip_gradients() + + expected_norm = (1**2 + 1**2 + 1**2 + 1**2) ** 0.5 # 2.0 + assert torch.allclose(norm, torch.tensor(expected_norm)), "Norm should match expected total norm" + + scale = max_norm / expected_norm # 0.5 + expected_grad = torch.tensor([1.0 * scale, 1.0 * scale]) + for param in mock_model.parameters(): + assert torch.allclose(param.grad, expected_grad), "Gradients should be clipped" + + +def test_fsdp2_logging_only_gradient_clipper(): + """ + Test that FSDP2LoggingOnlyGradientClipper computes the gradient norm without clipping. + """ + mock_model = MockFSDPModel() + + norm_type = GradientClippingMode.P2_NORM + clipper = FSDP2LoggingOnlyGradientClipper(wrapped_model=mock_model, norm_type=norm_type) + + # Call clip_gradients + norm = clipper.clip_gradients() + + # Verify the norm and that gradients are unchanged + expected_norm = (1**2 + 1**2 + 1**2 + 1**2) ** 0.5 # 2.0 + assert torch.allclose(norm, torch.tensor(expected_norm)), "Norm should match expected total norm" + for param in mock_model.parameters(): + assert torch.allclose(param.grad, torch.tensor([1.0, 1.0])), "Gradients should not be modified" + + def test_pipeline_parallelized_clipping_equivalent_to_single_stage_clipping(): max_norm = 0.1 # create full model and initialize deterministically @@ -48,14 +194,14 @@ def test_pipeline_parallelized_clipping_equivalent_to_single_stage_clipping(): results[rank] = coll # perform clipping on the full model (single-stage) - FSDP2GradientClipper.clip_grad_norm_( - parameters=full.parameters(), + FSDP2GradientClipper( + 
wrapped_model=full, max_norm=max_norm, - norm_type=2.0, + norm_type=GradientClippingMode.P2_NORM, + device_mesh=None, error_if_nonfinite=True, foreach=True, - device_mesh=None, - ) + ).clip_gradients() # compare full model parts to the per-stage results full_a_params = [p.data.cpu() for p in full.a.parameters()] @@ -151,14 +297,14 @@ def __getitem__(self, name: str): mesh = DummyDeviceMesh(dist.group.WORLD) # call the clipping function which will perform all_reduce across the pp group - FSDP2GradientClipper.clip_grad_norm_( - parameters=part.parameters(), + FSDP2GradientClipper( + wrapped_model=part, max_norm=max_norm, - norm_type=2.0, + norm_type=GradientClippingMode.P2_NORM, + device_mesh=mesh, error_if_nonfinite=True, foreach=True, - device_mesh=mesh, - ) + ).clip_gradients() # collect clipped parameter tensors (cpu) and serialize to plain Python lists # to avoid multiprocessing shared-storage pickling issues. From c6fee1818c4ac07406c712561cc59a113f6fda78 Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Fri, 24 Oct 2025 17:26:06 +0200 Subject: [PATCH 66/67] fix(logging): Correct number of parameters computation in case of pipeline parallelism. --- src/modalities/main.py | 2 +- src/modalities/running_env/fsdp/device_mesh.py | 3 +-- src/modalities/util.py | 14 +++++++++++--- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/modalities/main.py b/src/modalities/main.py index 6624d9f8b..59845376f 100644 --- a/src/modalities/main.py +++ b/src/modalities/main.py @@ -154,7 +154,7 @@ def run(self, components: TrainingComponentsInstantiationModel): loss_fun=components.loss_fn, num_ranks=components.settings.cuda_env.world_size, ) - num_params = get_total_number_of_trainable_parameters(components.app_state.model) + num_params = get_total_number_of_trainable_parameters(components.app_state.model, components.device_mesh) components.evaluation_subscriber.consume_dict({"No. 
parameters": num_params}) logging.info(f"Training model with {num_params} parameters.") diff --git a/src/modalities/running_env/fsdp/device_mesh.py b/src/modalities/running_env/fsdp/device_mesh.py index e9f1f3e95..3a217c0e3 100644 --- a/src/modalities/running_env/fsdp/device_mesh.py +++ b/src/modalities/running_env/fsdp/device_mesh.py @@ -5,7 +5,6 @@ from torch.distributed.device_mesh import DeviceMesh, init_device_mesh from modalities.exceptions import ConfigError -from modalities.util import print_rank_0 from modalities.utils.logger_utils import get_logger logger = get_logger("model_factory") @@ -124,7 +123,7 @@ def get_device_mesh( names.append(name) names = tuple(names) device_mesh = init_device_mesh(device_type, dims, mesh_dim_names=names) - print_rank_0(f"{device_mesh=} | {world_size=} | {enable_loss_parallel=}") + logger.info(f"{device_mesh=} | {world_size=} | {enable_loss_parallel=}") # TODO: Torch Titan had some more checks here. We need to check if we also need those: # https://github.com/pytorch/torchtitan/blob/b291ad662493b63d25b038a30a915082d3617baf/torchtitan/distributed/parallel_dims.py#L86-L104 return device_mesh diff --git a/src/modalities/util.py b/src/modalities/util.py index 42b1360ac..9f8e749e9 100644 --- a/src/modalities/util.py +++ b/src/modalities/util.py @@ -11,12 +11,14 @@ import torch.distributed as dist import torch.nn as nn from pydantic import ValidationError +from torch.distributed.device_mesh import DeviceMesh from torch.distributed.fsdp import FSDPModule as FSDP2 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP1 from torch.distributed.tensor import DTensor from torch.types import Number from modalities.exceptions import TimeRecorderStateError +from modalities.running_env.fsdp.device_mesh import ParallelismDegrees from modalities.running_env.fsdp.reducer import Reducer from modalities.utils.typing_utils import FSDPX @@ -164,12 +166,13 @@ def get_local_number_of_trainable_parameters(model: nn.Module) -> int: return 
num_params -def get_total_number_of_trainable_parameters(model: FSDPX) -> Number: +def get_total_number_of_trainable_parameters(model: FSDPX, device_mesh: DeviceMesh | None) -> Number: """Returns the total number of trainable parameters across all ranks. The model must be sharded with FSDP1 or FSDP2. Args: model (FSDPX): The model for which to calculate the number of trainable parameters. + device_mesh (DeviceMesh | None): The device mesh used for distributed training. Returns: Number: The total number of trainable parameters across all ranks. @@ -216,8 +219,13 @@ def get_total_number_of_trainable_parameters(model: FSDPX) -> Number: # >>> parameter_tensor.shape[0] * parameter_tensor.shape[1] # 6438912 - total_num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) - return total_num_params + num_params_tensor = sum(p.numel() for p in model.parameters() if p.requires_grad) + if device_mesh is not None and ParallelismDegrees.PP.value in device_mesh.mesh_dim_names: + num_params_tensor = torch.tensor(num_params_tensor).cuda() + pp_mesh = device_mesh[ParallelismDegrees.PP.value] + dist.all_reduce(num_params_tensor, op=dist.ReduceOp.SUM, group=pp_mesh.get_group()) + return num_params_tensor.item() + return num_params_tensor else: raise ValueError( f"Model type {type(model)} is not supported. " From 29acaf4799697aca88ee71f7cc8e3119c6708ca1 Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Fri, 24 Oct 2025 17:33:44 +0200 Subject: [PATCH 67/67] revert: Removed clarified todo. No adaptions needed since fsdp1 is not compatible with PP and TP. 
--- src/modalities/util.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/modalities/util.py b/src/modalities/util.py index 9f8e749e9..7c479da1d 100644 --- a/src/modalities/util.py +++ b/src/modalities/util.py @@ -189,11 +189,9 @@ def get_total_number_of_trainable_parameters(model: FSDPX, device_mesh: DeviceMe # However, users can also provide their own sharding process groups (currently not supported in Modalities) # which would require to adapt the code. if model.sharding_strategy.name == "NO_SHARD": - sharding_factor = dist.get_world_size() # TODO Check if we should use number of data parallel ranks instead + sharding_factor = dist.get_world_size() if model.sharding_strategy.name == "HYBRID_SHARD": - sharding_factor = ( - dist.get_world_size() // torch.cuda.device_count() - ) # TODO Check if we should use number of data parallel ranks instead + sharding_factor = dist.get_world_size() // torch.cuda.device_count() elif model.sharding_strategy.name == "FULL_SHARD": sharding_factor = 1 total_num_params = total_num_params // sharding_factor