
Commit eb40338

Merge branch 'main' into clairlee/dev/hybrid
2 parents: cf7eb3f + cfe8cc0

3 files changed

Lines changed: 172 additions & 2 deletions

File tree

primus/backends/megatron/patches/checkpoint_patches.py

Lines changed: 76 additions & 0 deletions
@@ -46,3 +46,79 @@ def patch_filesystem_writer_async(ctx: PatchContext):
     log_rank_0(
         "[Patch:megatron.checkpoint.filesystem_writer_async] Patch FileSystemWriterAsync successfully."
     )
+
+
+@register_patch(
+    "megatron.checkpoint.save_checkpoint",
+    backend="megatron",
+    phase="before_train",
+    description="Wrap save_checkpoint to skip saving at the last iteration",
+)
+def patch_save_checkpoint(ctx: PatchContext):
+    """
+    Wrap Megatron's save_checkpoint to skip saving at the last iteration.
+
+    This patch monkey-patches the save_checkpoint function in the
+    megatron.training.training module to check whether:
+    1. disable_last_saving is True
+    2. the current iteration equals train_iters (the final iteration)
+
+    If both conditions are met, the checkpoint save is skipped.
+    """
+    try:
+        import megatron.training.training as training_module
+    except ImportError as e:
+        log_rank_0(f"[Patch:megatron.checkpoint.save_checkpoint] Skip patch (Megatron not available): {e}")
+        return
+
+    # Save the original function so the wrapper can delegate to it.
+    original_save_checkpoint = training_module.save_checkpoint
+
+    # The signature below matches the original Megatron save_checkpoint interface,
+    # but the wrapper itself only inspects a subset of the arguments.
+    def wrapped_save_checkpoint(
+        iteration,
+        model,
+        optimizer,
+        opt_param_scheduler,
+        num_floating_point_operations_so_far,
+        checkpointing_context=None,
+        pipeline_rank=None,
+        expert_rank=None,
+        tensor_rank=None,
+        pipeline_parallel=None,
+        expert_parallel=None,
+        non_persistent_ckpt=False,
+        train_data_iterator=None,
+        preprocess_common_state_dict_fn=None,
+        release=False,
+    ):
+        args = ctx.extra.get("backend_args", {})
+
+        if args.disable_last_saving and iteration == args.train_iters:
+            log_rank_0(
+                f"[Patch:megatron.checkpoint.save_checkpoint] Skip saving at the last iteration: {iteration}"
+            )
+            return
+
+        # Call the original save_checkpoint with explicit keyword arguments for clarity.
+        return original_save_checkpoint(
+            iteration,
+            model,
+            optimizer,
+            opt_param_scheduler,
+            num_floating_point_operations_so_far,
+            checkpointing_context=checkpointing_context,
+            pipeline_rank=pipeline_rank,
+            expert_rank=expert_rank,
+            tensor_rank=tensor_rank,
+            pipeline_parallel=pipeline_parallel,
+            expert_parallel=expert_parallel,
+            non_persistent_ckpt=non_persistent_ckpt,
+            train_data_iterator=train_data_iterator,
+            preprocess_common_state_dict_fn=preprocess_common_state_dict_fn,
+            release=release,
+        )
+
+    training_module.save_checkpoint = wrapped_save_checkpoint
+    log_rank_0("[Patch:megatron.checkpoint.save_checkpoint] Patch save_checkpoint successfully.")
Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
+###############################################################################
+# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+
+"""
+Megatron Tokenizer Builder Patches
+
+Override Megatron's build_tokenizer to use the Primus version, which properly
+handles custom tokenizer types (Llama2Tokenizer, Llama3Tokenizer, etc.)
+with HuggingFace Hub ID support.
+
+Background:
+-----------
+Megatron's official _Llama2Tokenizer only supports local SentencePiece files,
+while Primus extends it to support HuggingFace Hub IDs (e.g., meta-llama/Llama-2-7b-hf).
+
+Without this patch, the new architecture (PrimusRuntime) would call Megatron's
+official build_tokenizer, causing failures when using custom tokenizer types
+with Hub IDs.
+
+This patch ensures both legacy and new architectures use the same tokenizer
+building logic.
+"""
+
+from primus.core.patches import PatchContext, register_patch
+from primus.modules.module_utils import log_rank_0
+
+
+@register_patch(
+    "megatron.tokenizer.build_tokenizer_override",
+    backend="megatron",
+    phase="setup",
+    description="Override Megatron's build_tokenizer to support Primus custom tokenizer types with HuggingFace Hub IDs",
+)
+def patch_build_tokenizer_override(ctx: PatchContext):
+    """
+    Monkey-patch Megatron's build_tokenizer with the Primus version.
+
+    This ensures that custom tokenizer types (Llama2Tokenizer, Llama3Tokenizer,
+    DeepSeekV2Tokenizer, etc.) are properly handled:
+
+    - All custom types use _HuggingFaceTokenizer internally
+    - Support for HuggingFace Hub IDs (e.g., meta-llama/Llama-2-7b-hf)
+    - Consistent behavior between legacy and new architectures
+
+    Without this patch:
+    -------------------
+    - tokenizer_type: Llama2Tokenizer
+      tokenizer_model: meta-llama/Llama-2-7b-hf
+      → Calls Megatron's _Llama2Tokenizer
+      → Expects a local file path
+      → ❌ FileNotFoundError
+
+    With this patch:
+    ----------------
+    - tokenizer_type: Llama2Tokenizer
+      tokenizer_model: meta-llama/Llama-2-7b-hf
+      → Calls Primus build_tokenizer
+      → Maps to _HuggingFaceTokenizer
+      → Supports Hub IDs
+      → ✅ Success
+    """
+    try:
+        import megatron.training.global_vars as megatron_global_vars
+        import pretrain_gpt
+    except ImportError as e:
+        log_rank_0(
+            f"[Patch:megatron.tokenizer.build_tokenizer_override] "
+            f"Skip patch (Megatron not available): {e}"
+        )
+        return
+
+    # Import the Primus build_tokenizer
+    from primus.backends.megatron.training.tokenizer.tokenizer import (
+        build_tokenizer as primus_build_tokenizer,
+    )
+
+    # Save the originals for reference (optional); do not overwrite them on re-application
+    if not hasattr(megatron_global_vars, "_original_build_tokenizer"):
+        megatron_global_vars._original_build_tokenizer = megatron_global_vars.build_tokenizer
+    if not hasattr(pretrain_gpt, "_original_build_tokenizer"):
+        pretrain_gpt._original_build_tokenizer = pretrain_gpt.build_tokenizer
+
+    # Replace Megatron's build_tokenizer with the Primus version
+    megatron_global_vars.build_tokenizer = primus_build_tokenizer
+    pretrain_gpt.build_tokenizer = primus_build_tokenizer
+
+    log_rank_0(
+        "[Patch:megatron.tokenizer.build_tokenizer_override] "
+        "✓ Replaced Megatron build_tokenizer with Primus version"
+    )
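
The stash-then-replace step above is what keeps the override idempotent: the original callable is saved only on the first application, so re-running the patch cannot clobber it. A small self-contained sketch of the same idiom (illustrative only; the dummy module and function names are made up, not part of this commit):

    # Illustrative only: the stash-then-replace idiom on a dummy module object.
    import types

    dummy = types.ModuleType("dummy")
    dummy.build_tokenizer = lambda name: f"original:{name}"

    def primus_style_build_tokenizer(name):
        return f"patched:{name}"

    # Save the original once, so applying the patch twice is harmless.
    if not hasattr(dummy, "_original_build_tokenizer"):
        dummy._original_build_tokenizer = dummy.build_tokenizer
    dummy.build_tokenizer = primus_style_build_tokenizer

    assert dummy.build_tokenizer("Llama2Tokenizer") == "patched:Llama2Tokenizer"
    assert dummy._original_build_tokenizer("Llama2Tokenizer") == "original:Llama2Tokenizer"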

primus/cli/subcommands/train.py

Lines changed: 3 additions & 2 deletions
@@ -17,7 +17,7 @@ def _resolve_pretrain_runtime(args) -> str:
 
     Priority:
     1) Explicit env override via PRIMUS_TRAIN_RUNTIME
-    2) Auto-detect by backend framework (TorchTitan -> core, others -> legacy)
+    2) Auto-detect by backend framework (TorchTitan/Megatron -> core, others -> legacy)
     """
     runtime_entry = getenv("PRIMUS_TRAIN_RUNTIME", "").strip().lower()
     if runtime_entry in ("legacy", "core"):
@@ -38,7 +38,8 @@ def _resolve_pretrain_runtime(args) -> str:
     except Exception:
         framework = None
 
-    return "core" if framework == "torchtitan" else "legacy"
+    supported_frameworks = ["torchtitan", "megatron"]
+    return "core" if framework in supported_frameworks else "legacy"
 
 
 def run(args, overrides: List[str]):
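
A minimal sketch (illustrative, not the CLI code itself) of the resolution order that _resolve_pretrain_runtime implements after this change: an explicit PRIMUS_TRAIN_RUNTIME value wins, otherwise the backend framework decides core vs. legacy. The name resolve_runtime is hypothetical; only the logic mirrors the diff above.

    # Hypothetical standalone version of the runtime-resolution logic shown in the diff.
    from os import getenv

    def resolve_runtime(framework):
        runtime_entry = getenv("PRIMUS_TRAIN_RUNTIME", "").strip().lower()
        if runtime_entry in ("legacy", "core"):
            return runtime_entry
        supported_frameworks = ["torchtitan", "megatron"]
        return "core" if framework in supported_frameworks else "legacy"

    print(resolve_runtime("megatron"))  # "core" after this commit (previously "legacy")
    print(resolve_runtime("unknown"))   # "legacy"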
