
Commit cdadbf8

[RFC] Lift freqs_cis as an input of models
freqs_cis is sensitive to the sequence order. CP load balancing will shuffle the samples, so each batch will have a different order. As a result, we have to lift these order-sensitive buffers to the model inputs and broadcast them along the batch dimension so that PP will correctly shard freqs_cis without breaking correctness.

ghstack-source-id: 2d88844
Pull-Request-resolved: #1797
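As a rough illustration of the motivation (a toy sketch only, not torchtitan code; all names below are made up): a shared freqs_cis buffer of shape (max_seqlen, dim) assumes row i belongs to token position i, so once CP load balancing reorders each sample's token positions, every sample needs its own rows, which is exactly what a lifted (batch_size, seqlen, dim) input provides.

import torch

# Toy sketch: a per-position frequency table is order sensitive --
# row i encodes position i, so it must follow each sample's token order.
max_seqlen, dim, batch_size, seqlen = 16, 4, 2, 8
freqs_cis = torch.randn(max_seqlen, dim)

# Hypothetical per-sample position orders after CP load balancing.
positions = torch.stack(
    [torch.arange(seqlen), torch.arange(seqlen).flip(0)]
)  # (batch_size, seqlen)

# Lifting the buffer to an input: one (seqlen, dim) slice per sample,
# giving a (batch_size, seqlen, dim) tensor that PP/CP can shard with the batch.
freqs_cis_input = freqs_cis[positions]
assert freqs_cis_input.shape == (batch_size, seqlen, dim)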
Parent: 24f3fd4

File tree: 3 files changed, +34 −7 lines changed

torchtitan/models/llama3/model/model.py

Lines changed: 14 additions & 6 deletions
@@ -92,8 +92,7 @@ def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor) -> torch.Tensor
     This function reshapes the frequency tensor to have the same shape as the target tensor 'x'
     for the purpose of broadcasting the frequency tensor during element-wise operations.

-    The input freqs_cis tensor is assumed to be of shape (max_seqlen, dim),
-    and the first seqlen elements will be sliced, but dim must match x.
+    The input freqs_cis tensor is assumed to be of shape (batch_size, seqlen, dim).

     Args:
         freqs_cis (torch.Tensor): Frequency tensor to be reshaped.
@@ -104,10 +103,10 @@ def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor) -> torch.Tensor
     """
     ndim = x.ndim
     assert ndim > 1
+    batch_size = x.shape[0]
     seqlen = x.shape[1]
-    freqs_cis = freqs_cis[0:seqlen]
-    assert freqs_cis.shape == (seqlen, x.shape[-1])
-    shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
+    assert freqs_cis.shape == (batch_size, seqlen, x.shape[-1])
+    shape = [d if i in (0, 1, ndim - 1) else 1 for i, d in enumerate(x.shape)]
     return freqs_cis.view(*shape)


@@ -472,9 +471,18 @@ def get_attention_masks(
             and_masks(*mask_mods), B, None, input_batch.shape[1], input_batch.shape[1]
         )

+    def get_order_sensitive_buffers(
+        self,
+        batch_size: int,
+        seq_len: int,
+    ) -> tuple[tuple[torch.Tensor, ...], tuple[int, ...]]:
+        freqs_cis = self.freqs_cis[:seq_len].repeat(batch_size, 1, 1)
+        return ((freqs_cis,), (1,))
+
     def forward(
         self,
         tokens: torch.Tensor,
+        freqs_cis: torch.Tensor,
         attention_masks: AttentionMasksType | None = None,
         input_batch: torch.Tensor | None = None,
     ):
@@ -499,7 +507,7 @@ def forward(
         h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens

         for layer in self.layers.values():
-            h = layer(h, self.freqs_cis, attention_masks=attention_masks)
+            h = layer(h, freqs_cis, attention_masks=attention_masks)

         h = self.norm(h) if self.norm else h
         output = self.output(h) if self.output else h
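For reference, a minimal shape check of the updated broadcasting contract, with reshape_for_broadcast copied from the hunk above; the concrete tensor sizes are made-up examples:

import torch

def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
    # Same logic as the hunk above: freqs_cis is (batch_size, seqlen, dim) and is
    # viewed so it broadcasts over every dim of x except batch, seqlen, and the last.
    ndim = x.ndim
    assert ndim > 1
    batch_size = x.shape[0]
    seqlen = x.shape[1]
    assert freqs_cis.shape == (batch_size, seqlen, x.shape[-1])
    shape = [d if i in (0, 1, ndim - 1) else 1 for i, d in enumerate(x.shape)]
    return freqs_cis.view(*shape)

# Example sizes (arbitrary): x as it typically appears inside attention,
# (batch, seqlen, n_heads, head_dim-like last dim).
x = torch.randn(2, 8, 4, 16)
freqs_cis = torch.randn(2, 8, 16)  # the lifted per-batch buffer
assert reshape_for_broadcast(freqs_cis, x).shape == (2, 8, 1, 16)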

torchtitan/protocols/model.py

Lines changed: 8 additions & 0 deletions
@@ -70,3 +70,11 @@ def get_attention_masks(
         raise NotImplementedError(
             "This model does not support attention masking/Flex Attention."
         )
+
+    def get_order_sensitive_buffers(
+        self,
+        batch_size: int,
+        seq_len: int,
+    ) -> tuple[tuple[torch.Tensor, ...], tuple[int, ...]]:
+        raise NotImplementedError()
+        return ((), ())
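A hedged sketch of what a concrete override of this protocol method might look like (illustrative class, not from the repo). The first tuple holds the buffers to lift as inputs; the (1,) returned by the llama3 implementation above appears to mark the sequence dimension of each buffer, and the sketch follows that assumption.

import torch
from torch import nn

class ToyTransformer(nn.Module):
    """Illustrative only: one way to satisfy the protocol method above."""

    def __init__(self, max_seqlen: int = 2048, dim: int = 64) -> None:
        super().__init__()
        # Positional buffer analogous to freqs_cis (random values for illustration).
        self.register_buffer("freqs_cis", torch.randn(max_seqlen, dim))

    def get_order_sensitive_buffers(
        self,
        batch_size: int,
        seq_len: int,
    ) -> tuple[tuple[torch.Tensor, ...], tuple[int, ...]]:
        # Expand to (batch_size, seq_len, dim) so the buffer can be sharded
        # along the batch together with the inputs; the second tuple marks
        # the sequence dim of each buffer (assumed, following llama3's (1,)).
        freqs_cis = self.freqs_cis[:seq_len].repeat(batch_size, 1, 1)
        return ((freqs_cis,), (1,))

buffers, seq_dims = ToyTransformer().get_order_sensitive_buffers(batch_size=2, seq_len=8)
assert buffers[0].shape == (2, 8, 64) and seq_dims == (1,)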

torchtitan/train.py

Lines changed: 12 additions & 1 deletion
@@ -421,6 +421,11 @@ def forward_backward_step(
             extra_inputs=extra_inputs,
         )

+        # Get the order sensitive buffers
+        order_sensitive_buffers = model_parts[0].get_order_sensitive_buffers(
+            inputs.size(0), inputs.size(1)
+        )
+
         # apply context parallelism if cp is enabled
         # ensure CP handles the separate freqs_cis buffer for each pp stage
         cp_mesh = parallel_dims.world_mesh["cp"] if parallel_dims.cp_enabled else None
@@ -445,13 +450,15 @@ def forward_backward_step(
             if self.pp_has_first_stage:
                 self.pp_schedule.step(
                     inputs,
+                    *order_sensitive_buffers[0],
                     **extra_inputs,
                     target=targets,
                     losses=losses,
                     input_batch=inputs,
                 )
             else:
                 self.pp_schedule.step(
+                    *order_sensitive_buffers[0],
                     target=targets,
                     losses=losses,
                     input_batch=inputs,
@@ -472,7 +479,11 @@ def forward_backward_step(
             with self.train_context(optional_context_parallel_ctx):
                 assert len(model_parts) == 1
                 with self.maybe_enable_amp:
-                    pred = model_parts[0](inputs, **extra_inputs)
+                    pred = model_parts[0](
+                        inputs,
+                        *order_sensitive_buffers[0],
+                        **extra_inputs,
+                    )
                     loss = self.loss_fn(pred, labels)
                 # need to free pred before bwd to avoid peaking memory
                 del pred
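Trainer-side, the splicing in the hunks above can be read as a small helper (a sketch only; run_step is not a torchtitan function, and it assumes the model part's forward accepts the lifted buffers as positional arguments right after the tokens, as in the llama3 change above):

import torch
from torch import nn

def run_step(model_part: nn.Module, inputs: torch.Tensor, **extra_inputs) -> torch.Tensor:
    # Mirror of the non-PP branch above: fetch the order-sensitive buffers for
    # this batch shape, then pass them positionally right after the tokens.
    buffers, _seq_dims = model_part.get_order_sensitive_buffers(
        inputs.size(0), inputs.size(1)
    )
    return model_part(inputs, *buffers, **extra_inputs)

With a model exposing get_order_sensitive_buffers and a matching forward (e.g. the toy sketch after the protocol diff), this reproduces the pred = model_parts[0](inputs, *order_sensitive_buffers[0], **extra_inputs) call.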
