# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from functools import partial

import torch
import torch.distributed._functional_collectives as funcol
import torch.distributed.distributed_c10d as c10d
from torch import distributed as dist
from torch.distributed.device_mesh import DeviceMesh

from torchtitan.components.ft.manager import FTManager
from torchtitan.config.job_config import JobConfig
from torchtitan.distributed.parallel_dims import ParallelDims
from torchtitan.tools.logging import logger

IGNORE_INDEX = -100  # PyTorch's default ignore_index for F.cross_entropy


# WARNING: currently this does not take gradient accumulation into account,
# so the gradient can still be biased toward grad accum steps with fewer valid tokens.
# See: https://github.com/pytorch/torchtitan/issues/1842
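# Illustrative sketch of the caveat (hypothetical numbers; single rank for simplicity):
# with two grad accum steps holding 10 and 1000 valid tokens, each step's loss is
# normalized by its own token count, so once the per-step gradients are summed, a
# token from the 10-token step carries roughly 100x the weight of a token from the
# 1000-token step.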
def token_imbalance_ce_loss(
    pred: torch.Tensor,
    labels: torch.Tensor,
    token_mesh: DeviceMesh,
    ft_pg: dist.ProcessGroup | None,
) -> torch.Tensor:
| 33 | + """ |
| 34 | + Cross‑entropy loss that is *robust* to varying numbers of valid tokens across ranks. |
| 35 | +
|
| 36 | + In a typical distributed training setup (data parallel + sequence parallel), |
| 37 | + each rank computes the loss over **only its local tokens** and returns an |
| 38 | + *average* over those tokens: |
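
        # illustrative pseudocode; these names are not identifiers in this module
        loss_rank = sum_of_CE_over_local_valid_tokens / num_local_valid_tokens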

    Afterwards, when Fully-Sharded Data Parallel (FSDP) averages the gradients
    across all ranks, the resulting update is equivalent to a **global sample
    average** *only if every rank contains the same number of tokens*.
    In practice that assumption is violated in many workloads:
    - Sequences are padded to a fixed length -> some ranks see fewer real tokens.
    - SFT fine-tuning, where the user's query tokens are masked out.
    - Vision encoders often inject a large number of “ignored” tokens as
      context that are not trained with the text tokens' loss.

    This function fixes the issue by **dividing the sum of per-token losses** by the
    *average* number of non-ignored tokens per rank, computed via an all-reduce over
    `token_mesh`. The returned scalar therefore represents the loss that would
    be obtained if every token in the entire distributed batch contributed with
    equal weight to the global gradient, regardless of how many padded or
    ignored tokens each rank contains.
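
        # illustrative pseudocode, mirroring the sketch above
        loss_rank = sum_of_CE_over_local_valid_tokens / mean_over_ranks(num_valid_tokens)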

    Parameters
    ----------
    pred : torch.Tensor
        Model output logits of shape ``(batch, seq_len, vocab_size)``.
    labels : torch.Tensor
        Target token ids of shape ``(batch, seq_len)``; positions equal to
        ``IGNORE_INDEX`` are excluded from the loss.
    token_mesh : DeviceMesh
        A device mesh that contains all ranks participating in this training step's
        loss computation. The function performs an ``all_reduce`` (mean) of each
        rank's `num_tokens` tensor across this mesh.
    ft_pg : dist.ProcessGroup | None
        Optional process group for fault-tolerant training; if provided, the token
        count is additionally averaged across it.

    Returns
    -------
    torch.Tensor
        A scalar loss tensor, ready for ``backward()`` followed by FSDP's
        all-reduce (mean) of gradients.

    Notes
    -----
    * The function internally uses :func:`torch.nn.functional.cross_entropy`
      with ``reduction="sum"`` so that each token contributes exactly once to
      the numerator. The denominator is the **average** number of valid tokens
      per rank, not the local count.
    * If a rank contains no valid tokens (i.e., all labels are ``IGNORE_INDEX``),
      its contribution to the sum is zero and its `num_tokens` becomes zero.
      In that case the mean across ranks is still well-defined as long as
      at least one rank has a non-zero token count.
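    * Worked example (illustrative numbers): suppose rank 0 has 100 valid tokens
      with summed loss 200.0 and rank 1 has 50 valid tokens with summed loss 80.0.
      Per-rank averaging would return 2.0 and 1.6, and FSDP's gradient mean would
      weight rank 1's tokens twice as heavily as rank 0's. With the shared
      denominator (100 + 50) / 2 = 75, the ranks instead return 200 / 75 and
      80 / 75, and their mean equals the true global average (200 + 80) / 150.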
| 82 | + """ |
    # Sum of per-token CE (not a mean), so each valid token contributes exactly once.
    sum_loss = torch.nn.functional.cross_entropy(
        pred.flatten(0, 1).float(),
        labels.flatten(0, 1),
        reduction="sum",
        ignore_index=IGNORE_INDEX,
    )
    # Number of non-ignored tokens on this rank.
    num_tokens = (labels != IGNORE_INDEX).sum()
    # Average valid-token count across all ranks sharing this batch (DP and CP).
    avg_num_tokens_per_rank = funcol.all_reduce(
        num_tokens, reduceOp=c10d.ReduceOp.AVG.name, group=token_mesh
    )
    if ft_pg is not None:
        # Also average across the fault-tolerance process group, if one is used.
        avg_num_tokens_per_rank = funcol.all_reduce(
            avg_num_tokens_per_rank, reduceOp=c10d.ReduceOp.AVG.name, group=ft_pg
        )
    return sum_loss / avg_num_tokens_per_rank


def build_token_imbalance_ce_loss(
    job_config: JobConfig, parallel_dims: ParallelDims, ft_manager: FTManager, **kwargs
):
    del kwargs  # discard any unused arguments
    # NOTE: the device mesh over which the input tokens of shape (B, S, D) can be sliced:
    #   DP splits the batch dim B; CP splits the sequence dim S.
    token_mesh = parallel_dims.world_mesh["dp_cp"]
    ft_pg = ft_manager.loss_sync_pg
    loss_fn = partial(token_imbalance_ce_loss, token_mesh=token_mesh, ft_pg=ft_pg)
    if job_config.compile.enable and "loss" in job_config.compile.components:
        logger.info("Compiling the loss function with torch.compile")
        loss_fn = torch.compile(loss_fn, backend=job_config.compile.backend)
    return loss_fn
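

# Hedged usage sketch (illustration only; `model`, `input_ids`, and `labels` are
# placeholders, not identifiers defined in torchtitan):
#
#     loss_fn = build_token_imbalance_ce_loss(job_config, parallel_dims, ft_manager)
#     pred = model(input_ids)       # logits of shape (batch, seq_len, vocab_size)
#     loss = loss_fn(pred, labels)  # labels use IGNORE_INDEX at masked positions
#     loss.backward()               # FSDP then averages gradients across ranks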