Pipeline parallelism continued #399
base: main
Changes from all commits (67 commits).
**`pyproject.toml`**

```diff
@@ -1,12 +1,10 @@
 [project]
 name = "modalities"
 version = "0.3.2"
-requires-python = ">=3.10,<3.12"
 description = "Modalities, a PyTorch-native framework for distributed and reproducible foundation model training."
 readme = "README.md"
 dependencies = [
     "numpy<2.0",
-    "torch==2.6.0",
     "packaging",
     "tqdm",
     "pyyaml",
```

> **Review comment** (on the removed `torch` dependency pin): we should update the README that for now we need torch-nightly to be installed manually by the user.
**`modalities/evaluator.py`**

```diff
@@ -9,6 +9,7 @@
 from modalities.logging_broker.messages import ExperimentStatus, MessageTypes, ProgressUpdate
 from modalities.logging_broker.publisher import MessagePublisher
 from modalities.models.model import model_predict_batch
+from modalities.models.parallelism.pipeline_parallelism import Pipeline
 from modalities.running_env.fsdp.reducer import Reducer
 from modalities.trainer import ThroughputAggregationKeys
 from modalities.util import Aggregator, TimeRecorder
@@ -36,20 +37,42 @@ def evaluate_batch(
     batch: DatasetBatch,
     model: nn.Module,
     loss_fun: Callable[[InferenceResultBatch], torch.Tensor],
-) -> torch.Tensor:
+    scheduled_pipeline: Pipeline | None = None,
+) -> torch.Tensor | None:
     """Evaluate a single batch by forwarding it through the model and calculating the loss.
 
     Args:
         batch (DatasetBatch): The batch to evaluate
         model (nn.Module): The model to evaluate
         loss_fun (Callable[[InferenceResultBatch], torch.Tensor]): The loss function to calculate the loss
+        scheduled_pipeline (Pipeline | None, optional): In case of pipeline parallelism, this is used to
+            operate the model. Defaults to None.
 
     Returns:
-        torch.Tensor: The loss of the batch
+        torch.Tensor | None: The loss of the batch
+            None, if a non-last stage was processed in pipeline parallelism
     """
     with torch.no_grad():
-        result_batch = model_predict_batch(model=model, batch=batch)
-        loss = loss_fun(result_batch)
+        if scheduled_pipeline is not None:
+            pp_schedule = scheduled_pipeline.pp_schedule
+            targets, losses = (
+                (batch.targets[loss_fun.target_key].contiguous(), [])
+                if scheduled_pipeline.is_last_pp_stage
+                else (None, None)
+            )
+
+            if scheduled_pipeline.is_first_pp_stage:
+                pp_schedule.eval(batch.samples[model.sample_key].contiguous(), target=targets, losses=losses)
+            else:
+                pp_schedule.eval(target=targets, losses=losses)
+            loss = (
+                torch.mean(torch.stack(losses)).to(losses[0].device)
+                if scheduled_pipeline.is_last_pp_stage
+                else None
+            )
+        else:
+            result_batch = model_predict_batch(model=model, batch=batch)
+            loss = loss_fun(result_batch)
     return loss
 
 
 def evaluate(
```

> **Review comment** (on the pipeline-parallel branch): basically code duplication from the trainer.
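For orientation, here is a minimal single-process sketch of the control flow this diff introduces. The stub class below is a hypothetical stand-in (not from the codebase) for the real pipeline schedule, which materializes per-microbatch losses only on the last pipeline stage:

```python
from dataclasses import dataclass

import torch


@dataclass
class StubPipeline:
    """Hypothetical stand-in for the scheduled Pipeline wrapper."""

    is_first_pp_stage: bool
    is_last_pp_stage: bool

    def eval(self, inputs=None, target=None, losses=None):
        # The real pp_schedule.eval() runs this stage's model chunk; on the
        # last stage it appends one loss tensor per microbatch to `losses`.
        if self.is_last_pp_stage:
            losses.extend([torch.tensor(0.5), torch.tensor(0.7)])


def evaluate_batch_sketch(pipeline: StubPipeline, samples, targets) -> torch.Tensor | None:
    # Only the last stage owns the targets and collects the losses.
    target, losses = (targets, []) if pipeline.is_last_pp_stage else (None, None)
    pipeline.eval(samples if pipeline.is_first_pp_stage else None, target=target, losses=losses)
    # Non-last stages only forwarded activations, so they have no loss to report.
    return torch.mean(torch.stack(losses)) if pipeline.is_last_pp_stage else None


print(evaluate_batch_sketch(StubPipeline(True, False), torch.ones(2), torch.ones(2)))  # None
print(evaluate_batch_sketch(StubPipeline(False, True), torch.ones(2), torch.ones(2)))  # tensor(0.6000)
```

This is why `evaluate_batch` now returns `torch.Tensor | None`: every rank participates in the schedule, but only the last stage produces a number.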
```diff
@@ -58,6 +81,7 @@ def evaluate(
     data_loaders: list[LLMDataLoader],
     loss_fun: Callable[[InferenceResultBatch], torch.Tensor],
     num_train_steps_done: int,
+    scheduled_pipeline: Pipeline | None = None,
 ) -> dict[str, EvaluationResultBatch]:
     """Evaluate the model on a set of datasets.
@@ -66,6 +90,8 @@
         data_loaders (list[LLMDataLoader]): List of dataloaders to evaluate the model on
         loss_fun (Callable[[InferenceResultBatch], torch.Tensor]): The loss function to calculate the loss
         num_train_steps_done (int): The number of training steps done so far for logging purposes
+        scheduled_pipeline (Pipeline | None, optional): In case of pipeline parallelism, this is used to
+            operate the model. Defaults to None.
 
     Returns:
         dict[str, EvaluationResultBatch]: A dictionary containing the evaluation results for each dataloader
@@ -90,10 +116,13 @@
                 batch=batch,
                 model=model,
                 loss_fun=loss_fun,
+                scheduled_pipeline=scheduled_pipeline,
             )
 
-            cumulated_loss[0] += batch_loss.item()  # sum up batch loss
-            cumulated_loss[1] += 1
+            # The batch_loss might be None if we use pipeline parallelism and are not the last stage.
+            if batch_loss is not None:
+                cumulated_loss[0] += batch_loss.item()  # sum up batch loss
+                cumulated_loss[1] += 1
             batch_length_tensor = torch.tensor(len(batch)).to(device)
             thoughput_aggregator.add_value(key=ThroughputAggregationKeys.NUM_SAMPLES, value=batch_length_tensor)
```
**Training run orchestration**

```diff
@@ -9,6 +9,7 @@
 from modalities.dataloader.dataloader import LLMDataLoader
 from modalities.evaluator import Evaluator
 from modalities.loss_functions import Loss
+from modalities.models.parallelism.pipeline_parallelism import Pipeline
 from modalities.trainer import Trainer
 from modalities.training.training_progress import TrainingProgress
 from modalities.util import print_rank_0
@@ -40,6 +41,7 @@ def run(
     train_data_loader: LLMDataLoader,
     evaluation_data_loaders: list[LLMDataLoader],
     checkpoint_saving: CheckpointSaving,
+    scheduled_pipeline: Pipeline | None = None,
 ):
     """Runs the model training, including evaluation and checkpointing.
@@ -51,12 +53,15 @@
         train_data_loader (LLMDataLoader): Data loader with the training data.
         evaluation_data_loaders (list[LLMDataLoader]): List of data loaders with the evaluation data.
         checkpoint_saving (CheckpointSaving): Routine for saving checkpoints.
+        scheduled_pipeline (Pipeline | None, optional): In case of pipeline parallelism, this is used to
+            operate the model. Defaults to None.
     """
     evaluation_callback: Callable[[int], None] = partial(
         self._run_evaluation,
         model=app_state.model,
         evaluation_data_loaders=evaluation_data_loaders,
         evaluation_interval_in_steps=evaluation_interval_in_steps,
+        scheduled_pipeline=scheduled_pipeline,
     )
 
     checkpointing_callback: Callable[[TrainingProgress], None] = partial(
@@ -74,6 +79,7 @@
         evaluation_callback=evaluation_callback,
         checkpointing_callback=checkpointing_callback,
         training_log_interval_in_steps=training_log_interval_in_steps,
+        scheduled_pipeline=scheduled_pipeline,
     )
     print_rank_0(f"Training done at {datetime.now()}.")
@@ -101,11 +107,13 @@ def _run_evaluation(
     num_train_steps_done: int,
     evaluation_data_loaders: list[LLMDataLoader],
     evaluation_interval_in_steps: int,
+    scheduled_pipeline: Pipeline | None = None,
 ):
-    if num_train_steps_done % evaluation_interval_in_steps == 0:
+    if num_train_steps_done > 0 and num_train_steps_done % evaluation_interval_in_steps == 0:
         self.evaluator.evaluate(
             model=model,
             data_loaders=evaluation_data_loaders,
             loss_fun=self.loss_fun,
             num_train_steps_done=num_train_steps_done,
+            scheduled_pipeline=scheduled_pipeline,
         )
```

> **Review comment** (on passing `scheduled_pipeline` into the evaluation call): same as in previous comments; not a big fan of passing around the `scheduled_pipeline`.
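The `evaluation_callback` above is built with `functools.partial`, freezing everything except the step counter so the trainer can invoke it with just `num_train_steps_done`. A tiny self-contained illustration of that wiring (toy names), which also shows the effect of the new `> 0` guard: evaluation no longer fires at step 0:

```python
from functools import partial
from typing import Callable


def run_evaluation(model: str, interval: int, num_steps_done: int) -> None:
    # Mirrors _run_evaluation's guard: skip step 0 and off-interval steps.
    if num_steps_done > 0 and num_steps_done % interval == 0:
        print(f"evaluating {model} at step {num_steps_done}")


# Freeze all arguments except the step counter, as the orchestration code does.
evaluation_callback: Callable[[int], None] = partial(run_evaluation, "my-model", 100)

for step in range(0, 301, 100):
    evaluation_callback(step)  # fires at steps 100, 200, 300 but not at 0
```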
**`modalities/loss_functions.py`**

```diff
@@ -1,4 +1,5 @@
 from abc import ABC, abstractmethod
+from typing import overload
 
 import torch
 from torch.nn import CrossEntropyLoss
@@ -31,9 +32,16 @@ def __init__(self, target_key: str, prediction_key: str, tag: str = "CLMCrossEnt
         # Mean over the tokens in the local-batch (batch per rank)
         self.loss_fun = CrossEntropyLoss(reduction="mean")
 
+    @overload
     def __call__(self, forward_batch: InferenceResultBatch) -> torch.Tensor:
-        labels = forward_batch.get_targets(self.target_key)
-        lm_logits = forward_batch.get_predictions(self.prediction_key)
+        ...
+
+    @overload
+    def __call__(self, outputs: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
+        ...
+
+    def __call__(self, *args, **kwargs) -> torch.Tensor:
+        labels, lm_logits = self._parse_arguments(args, kwargs)
 
         # move labels to correct device to enable model parallelism
         labels = labels.to(lm_logits.device)
```

> **Review comment** (on the `_parse_arguments` dispatch): Could be improved from a software engineering point of view.
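As background on the mechanism used above: `@overload` declarations are visible only to type checkers; at runtime the single concrete `__call__` must dispatch on the actual argument types itself, which is what `_parse_arguments` does. A minimal self-contained toy (not from the codebase) showing the same pattern:

```python
from typing import overload

import torch


class ToyLoss:
    @overload
    def __call__(self, batch: dict) -> torch.Tensor: ...

    @overload
    def __call__(self, outputs: torch.Tensor, targets: torch.Tensor) -> torch.Tensor: ...

    def __call__(self, *args, **kwargs) -> torch.Tensor:
        # Runtime dispatch: the @overload stubs above only inform type checkers.
        if len(args) == 1 and isinstance(args[0], dict):
            outputs, targets = args[0]["outputs"], args[0]["targets"]
        elif len(args) == 2:
            outputs, targets = args
        else:
            raise TypeError("Invalid arguments for ToyLoss.__call__")
        return torch.mean((outputs - targets) ** 2)


loss = ToyLoss()
t = torch.ones(3)
print(loss({"outputs": t, "targets": t * 2}))  # batch form   -> tensor(1.)
print(loss(t, t * 2))                          # tensor form  -> tensor(1.)
```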
@@ -43,6 +51,41 @@ def __call__(self, forward_batch: InferenceResultBatch) -> torch.Tensor: | |
| loss = self.loss_fun(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) | ||
| return loss | ||
|
|
||
| def _parse_arguments( | ||
| self, | ||
| args: list[torch.Tensor] | list[InferenceResultBatch], | ||
| kwargs: dict[str, torch.Tensor] | dict[str, InferenceResultBatch], | ||
| ) -> tuple[torch.Tensor, torch.Tensor]: | ||
| if len(args) == 1 and isinstance(args[0], InferenceResultBatch): | ||
| forward_batch = args[0] | ||
| labels = forward_batch.get_targets(self.target_key) | ||
| lm_logits = forward_batch.get_predictions(self.prediction_key) | ||
| elif "forward_batch" in kwargs and isinstance(kwargs["forward_batch"], InferenceResultBatch): | ||
| forward_batch = kwargs["forward_batch"] | ||
| labels = forward_batch.get_targets(self.target_key) | ||
| lm_logits = forward_batch.get_predictions(self.prediction_key) | ||
| elif len(args) == 2 and all(isinstance(arg, torch.Tensor) for arg in args): | ||
| lm_logits, labels = args | ||
| elif ( | ||
| "outputs" in kwargs | ||
| and "targets" in kwargs | ||
| and isinstance(kwargs["outputs"], torch.Tensor) | ||
| and isinstance(kwargs["targets"], torch.Tensor) | ||
| ): | ||
| lm_logits = kwargs["outputs"] | ||
| labels = kwargs["targets"] | ||
| elif ( | ||
| len(args) == 1 | ||
| and "targets" in kwargs | ||
| and isinstance(args[0], torch.Tensor) | ||
| and isinstance(kwargs["targets"], torch.Tensor) | ||
| ): | ||
| lm_logits = args[0] | ||
| labels = kwargs["targets"] | ||
| else: | ||
| raise TypeError("Invalid arguments for CLMCrossEntropyLoss.__call__") | ||
| return labels, lm_logits | ||
|
|
||
|
Comment on lines
+54
to
+88
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Idea: What about defining a new component "pp-loss", which takes a normal loss function and handles the PP-specific part? Generally, I think this parsing function could be improved. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This was also our initial idea and we actually implemented it like this in a first draft. However, this had the disadvantage that we had to define the usage of the correct loss for a certain setup in the configs, making them more complex and less user friendly. |
||
|
|
||
| def nce_loss( | ||
| embedding1: torch.Tensor, embedding2: torch.Tensor, device: torch.device, is_asymmetric: bool, temperature: float | ||
|
|
||
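For comparison, a hypothetical sketch of the reviewer's "pp-loss" idea: a separate component that owns the two-tensor `(outputs, targets)` signature the pipeline schedule calls, delegating the actual math to a wrapped loss. The names are invented for illustration; as the author reply notes, the drawback is that configs must then select the right loss variant per setup:

```python
import torch
from torch.nn import CrossEntropyLoss


class PPLoss:
    """Hypothetical 'pp-loss' component: wraps a normal loss function and adds
    the (outputs, targets) call signature that the pipeline schedule uses."""

    def __init__(self, wrapped_loss: CrossEntropyLoss):
        self.wrapped_loss = wrapped_loss

    def __call__(self, outputs: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
        # Causal-LM shift, then flatten for CrossEntropyLoss, mirroring the
        # shift_logits/shift_labels logic in CLMCrossEntropyLoss.
        shift_logits = outputs[..., :-1, :].contiguous()
        shift_labels = targets[..., 1:].contiguous()
        return self.wrapped_loss(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))


# Usage sketch: batch of 2 sequences, length 8, vocabulary of 16 tokens.
logits = torch.randn(2, 8, 16)
labels = torch.randint(0, 16, (2, 8))
pp_loss = PPLoss(CrossEntropyLoss(reduction="mean"))
print(pp_loss(logits, labels))  # scalar loss tensor
```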
**Main training setup (`run(components)`)**

```diff
@@ -20,6 +20,7 @@
 from modalities.logging_broker.subscriber import MessageSubscriberIF
 from modalities.registry.components import COMPONENTS
 from modalities.registry.registry import Registry
+from modalities.running_env.fsdp.device_mesh import ParallelismDegrees, get_num_parallel_ranks
 from modalities.trainer import Trainer
 from modalities.util import get_synced_experiment_id_of_run, get_total_number_of_trainable_parameters, print_rank_0
```
@@ -110,11 +111,20 @@ def run(self, components: TrainingComponentsInstantiationModel): | |
| ) | ||
|
|
||
| # Trainer | ||
| # FIXME replace by get_parallel_degree | ||
| if components.device_mesh is None: | ||
| num_pipeline_parallel_ranks = 1 | ||
| num_data_parallel_ranks = 1 | ||
| else: | ||
| num_pipeline_parallel_ranks = get_num_parallel_ranks(components.device_mesh, ParallelismDegrees.PP) | ||
| num_data_parallel_ranks = get_num_parallel_ranks( | ||
| components.device_mesh, ParallelismDegrees.DP_SHARD | ||
| ) * get_num_parallel_ranks(components.device_mesh, ParallelismDegrees.DP_REPLICATE) | ||
| global_num_tokens_per_train_step = ( | ||
| components.settings.step_profile.local_train_micro_batch_size | ||
| * components.settings.step_profile.sequence_length | ||
| * components.settings.step_profile.gradient_accumulation_steps | ||
| * components.settings.cuda_env.world_size | ||
| * num_data_parallel_ranks | ||
| ) | ||
| trainer = Trainer( | ||
| global_rank=components.settings.cuda_env.global_rank, | ||
|
|
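To see why `world_size` is replaced by the number of data-parallel ranks: with pipeline parallelism, ranks in the same PP group process the same microbatches, so only data-parallel replication multiplies the token count. A worked example under an assumed 8-GPU mesh (2 PP x 2 DP-shard x 2 DP-replicate):

```python
# Assumed step profile and mesh; numbers are illustrative only.
local_train_micro_batch_size = 4
sequence_length = 2048
gradient_accumulation_steps = 8

pp, dp_shard, dp_replicate = 2, 2, 2
world_size = pp * dp_shard * dp_replicate          # 8 ranks in total
num_data_parallel_ranks = dp_shard * dp_replicate  # only 4 ranks see distinct data

# Old formula counted every rank, overcounting by the PP degree:
old = local_train_micro_batch_size * sequence_length * gradient_accumulation_steps * world_size
# New formula counts only ranks that consume distinct batches:
new = local_train_micro_batch_size * sequence_length * gradient_accumulation_steps * num_data_parallel_ranks

print(old, new, old // new)  # 524288 262144 2  (overcount factor equals the PP degree)
```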
@@ -128,6 +138,7 @@ def run(self, components: TrainingComponentsInstantiationModel): | |
| gradient_clipper=components.gradient_clipper, | ||
| global_num_tokens_per_train_step=global_num_tokens_per_train_step, | ||
| mfu_calculator=components.mfu_calculator, | ||
| num_pipeline_parallel_ranks=num_pipeline_parallel_ranks, | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would prefer if we kept the |
||
| ) | ||
|
|
||
| # Evaluator | ||
|
|
@@ -143,7 +154,7 @@ def run(self, components: TrainingComponentsInstantiationModel): | |
| loss_fun=components.loss_fn, | ||
| num_ranks=components.settings.cuda_env.world_size, | ||
| ) | ||
| num_params = get_total_number_of_trainable_parameters(components.app_state.model) | ||
| num_params = get_total_number_of_trainable_parameters(components.app_state.model, components.device_mesh) | ||
| components.evaluation_subscriber.consume_dict({"No. parameters": num_params}) | ||
| logging.info(f"Training model with {num_params} parameters.") | ||
|
|
||
|
|
@@ -169,6 +180,7 @@ def run(self, components: TrainingComponentsInstantiationModel): | |
| checkpointing_interval_in_steps=components.settings.intervals.checkpointing_interval_in_steps, | ||
| evaluation_interval_in_steps=components.settings.intervals.evaluation_interval_in_steps, | ||
| training_log_interval_in_steps=components.settings.intervals.training_log_interval_in_steps, | ||
| scheduled_pipeline=components.scheduled_pipeline if components.scheduled_pipeline else None, | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same point as for the trainer. Could we wrap the scheduled pipeline instead and use the existing model interfaces? |
||
| ) | ||
|
|
||
| def get_logging_publishers( | ||
|
|
||
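Finally, a hypothetical sketch of the reviewer's alternative: hide the scheduled pipeline behind the existing model interface so that `scheduled_pipeline` does not need to be threaded through the orchestration, Trainer, and Evaluator. Every name here (`PipelinedModelFacade`, the `step` call, the key attributes) is assumed for illustration, not taken from the PR:

```python
import torch
from torch import nn


class PipelinedModelFacade(nn.Module):
    """Hypothetical adapter: presents a scheduled pipeline through the plain
    nn.Module call convention that Trainer/Evaluator already consume."""

    def __init__(self, scheduled_pipeline, sample_key: str, target_key: str):
        super().__init__()
        self.scheduled_pipeline = scheduled_pipeline
        self.sample_key = sample_key
        self.target_key = target_key

    def forward(self, batch: dict[str, torch.Tensor]) -> torch.Tensor | None:
        pp = self.scheduled_pipeline
        # Same stage logic as in the PR's evaluate_batch, but owned by the wrapper.
        target, losses = (
            (batch[self.target_key].contiguous(), []) if pp.is_last_pp_stage else (None, None)
        )
        if pp.is_first_pp_stage:
            pp.pp_schedule.step(batch[self.sample_key].contiguous(), target=target, losses=losses)
        else:
            pp.pp_schedule.step(target=target, losses=losses)
        # Callers keep the familiar "model(batch) -> loss or None" contract.
        return torch.mean(torch.stack(losses)) if pp.is_last_pp_stage else None
```

Whether this is preferable depends on how much of the existing `model_predict_batch` contract the rest of the code relies on; the author reply above explains why a similar wrapping of the loss ran into config complexity.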
> **Review comment** (on the removed `requires-python = ">=3.10,<3.12"` in `pyproject.toml`): Why did we remove this? Our testing is always against 3.10 and 3.11. Do we need a more recent python version?