###############################################################################
# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE for license information.
###############################################################################

| 7 | +""" |
| 8 | +Megatron Tokenizer Builder Patches |
| 9 | +
|
| 10 | +Override Megatron's build_tokenizer to use Primus version which properly |
| 11 | +handles custom tokenizer types (Llama2Tokenizer, Llama3Tokenizer, etc.) |
| 12 | +with HuggingFace Hub ID support. |
| 13 | +
|
| 14 | +Background: |
| 15 | +----------- |
| 16 | +Megatron's official _Llama2Tokenizer only supports local SentencePiece files, |
| 17 | +while Primus extends it to support HuggingFace Hub IDs (e.g., meta-llama/Llama-2-7b-hf). |
| 18 | +
|
| 19 | +Without this patch, the new architecture (PrimusRuntime) would call Megatron's |
| 20 | +official build_tokenizer, causing failures when using custom tokenizer types |
| 21 | +with Hub IDs. |
| 22 | +
|
| 23 | +This patch ensures both legacy and new architectures use the same tokenizer |
| 24 | +building logic. |
| 25 | +""" |
| 26 | + |
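# Rough sketch (illustrative only, not the real implementation): based on the
# behavior described above, the Primus build_tokenizer is assumed to dispatch
# the custom types to Megatron's _HuggingFaceTokenizer, roughly like:
#
#     def build_tokenizer(args):
#         if args.tokenizer_type in ("Llama2Tokenizer", "Llama3Tokenizer",
#                                    "DeepSeekV2Tokenizer"):
#             # _HuggingFaceTokenizer accepts a local path or a HuggingFace
#             # Hub ID such as "meta-llama/Llama-2-7b-hf".
#             return _HuggingFaceTokenizer(args.tokenizer_model)
#         # Other types fall through to Megatron's stock handling.
#         ...
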
from primus.core.patches import PatchContext, register_patch
from primus.modules.module_utils import log_rank_0


@register_patch(
    "megatron.tokenizer.build_tokenizer_override",
    backend="megatron",
    phase="setup",
    description="Override Megatron's build_tokenizer to support Primus custom tokenizer types with HuggingFace Hub IDs",
)
def patch_build_tokenizer_override(ctx: PatchContext):
    """
    Monkey-patch Megatron's build_tokenizer with the Primus version.

    This ensures that custom tokenizer types (Llama2Tokenizer, Llama3Tokenizer,
    DeepSeekV2Tokenizer, etc.) are handled properly:

    - All custom types use _HuggingFaceTokenizer internally
    - HuggingFace Hub IDs (e.g., meta-llama/Llama-2-7b-hf) are supported
    - Behavior is consistent between the legacy and new architectures

    Without this patch:
    -------------------
    - tokenizer_type: Llama2Tokenizer
      tokenizer_model: meta-llama/Llama-2-7b-hf
      → Calls Megatron's _Llama2Tokenizer
      → Expects a local file path
      → ❌ FileNotFoundError

    With this patch:
    ----------------
    - tokenizer_type: Llama2Tokenizer
      tokenizer_model: meta-llama/Llama-2-7b-hf
      → Calls Primus build_tokenizer
      → Maps to _HuggingFaceTokenizer
      → Resolves the Hub ID
      → ✅ Success
    """
    try:
        import megatron.training.global_vars as megatron_global_vars
        import pretrain_gpt
    except ImportError as e:
        log_rank_0(
            f"[Patch:megatron.tokenizer.build_tokenizer_override] "
            f"Skip patch (Megatron not available): {e}"
        )
        return
    # Import Primus build_tokenizer
    from primus.backends.megatron.training.tokenizer.tokenizer import (
        build_tokenizer as primus_build_tokenizer,
    )
    # Save the originals (guarded so repeated patching never overwrites them),
    # which allows the patch to be reverted later.
    if not hasattr(megatron_global_vars, "_original_build_tokenizer"):
        megatron_global_vars._original_build_tokenizer = megatron_global_vars.build_tokenizer
    if not hasattr(pretrain_gpt, "_original_build_tokenizer"):
        pretrain_gpt._original_build_tokenizer = pretrain_gpt.build_tokenizer

    # Replace Megatron's build_tokenizer with the Primus version in both
    # namespaces: pretrain_gpt imports build_tokenizer by name, so it holds
    # its own module-level reference that must be rebound separately.
    megatron_global_vars.build_tokenizer = primus_build_tokenizer
    pretrain_gpt.build_tokenizer = primus_build_tokenizer
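    # To revert the patch (e.g., in tests), restore the saved references:
    #
    #     megatron_global_vars.build_tokenizer = (
    #         megatron_global_vars._original_build_tokenizer
    #     )
    #     pretrain_gpt.build_tokenizer = pretrain_gpt._original_build_tokenizer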

    log_rank_0(
        "[Patch:megatron.tokenizer.build_tokenizer_override] "
        "✓ Replaced Megatron build_tokenizer with Primus version"
    )
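    # Sanity check (illustrative): after this point, both rebound names point
    # at the Primus implementation.
    #
    #     assert megatron_global_vars.build_tokenizer is primus_build_tokenizer
    #     assert pretrain_gpt.build_tokenizer is primus_build_tokenizer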