7 changes: 6 additions & 1 deletion .github/PULL_REQUEST_TEMPLATE.md
@@ -1,3 +1,8 @@
<!--
IMPORTANT: If this PR targets the `main` branch, it must come from the `dev` branch.
PRs to main from other branches will be rejected.
-->

# What does this PR do?

<!--
@@ -30,4 +35,4 @@ Fixes # (issue)
Anyone in the community is free to review the PR once the tests have passed. Feel free to tag
members/contributors who may be interested in your PR.

<!-- Your PR will be replied to more quickly if you can figure out the right person to tag with @ -->
<!-- Your PR will be replied to more quickly if you can figure out the right person to tag with @ -->
15 changes: 15 additions & 0 deletions .github/workflows/pr-rules.yaml
@@ -0,0 +1,15 @@
name: Check PR Source Branch
on:
  pull_request:
    branches:
      - main

jobs:
  check-branch:
    runs-on: ubuntu-latest
    steps:
      - name: Check PR source branch
        if: github.base_ref == 'main' && github.head_ref != 'dev'
        run: |
          echo "ERROR: PRs to main must come from dev branch"
          exit 1
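
For reference, a minimal Python sketch of the rule this step enforces (`base_ref`/`head_ref` mirror GitHub's `github.base_ref`/`github.head_ref` context values; this is illustrative only and not part of the workflow):

```python
def pr_source_allowed(base_ref: str, head_ref: str) -> bool:
    """PRs targeting main are only accepted when they come from dev."""
    return not (base_ref == "main" and head_ref != "dev")


# dev -> main is accepted; any other head branch targeting main fails the check
assert pr_source_allowed("main", "dev")
assert not pr_source_allowed("main", "my-feature")
# PRs targeting branches other than main are unaffected
assert pr_source_allowed("dev", "my-feature")
```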
4 changes: 3 additions & 1 deletion README.md
@@ -33,6 +33,8 @@ Nanotron is a library for pretraining transformer models. It provides a simple a

📚 **Check out our [Ultrascale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook)** - A comprehensive guide to efficiently scale LLM training with Nanotron!

📝 **AI-generated docs thanks to [DeepWiki](https://deepwiki.com/huggingface/nanotron)**

## Installation

To run the code in this project, first create a Python virtual environment using e.g. `uv`:
@@ -108,7 +110,7 @@ For detailed instructions on training your first model, check out our [Your Firs
torchrun --nproc_per_node=1 run_generate.py --ckpt-path checkpoints/{checkpoint_number}/ --tp 1 --pp 1
```

Increase the value of `--tp` (tensor paralle) to accelerate generation with multiple GPUs and use a larger value of `--pp` (pipeline parallel) for very large models.
Increase the value of `--tp` (tensor parallel) to accelerate generation with multiple GPUs and use a larger value of `--pp` (pipeline parallel) for very large models.

### Debugging with VSCode
To debug with VSCode, add the following configuration to your `launch.json` file:
14 changes: 12 additions & 2 deletions src/nanotron/config/config.py
@@ -17,6 +17,7 @@
from nanotron.config.models_config import ExistingCheckpointInit, NanotronConfigs, RandomInit, SpectralMupInit
from nanotron.config.parallelism_config import ParallelismArgs
from nanotron.config.utils_config import (
InitScalingMethod,
RecomputeGranularity,
cast_str_to_pipeline_engine,
cast_str_to_torch_dtype,
@@ -460,6 +461,13 @@ def __post_init__(self):

if self.s3_upload is not None:
self.s3_upload.__post_init__()
if self.lighteval is not None:
if self.lighteval.eval_interval is None:
self.lighteval.eval_interval = self.checkpoints.checkpoint_interval
else:
assert (
self.lighteval.eval_interval % self.checkpoints.checkpoint_interval == 0
), f"eval_interval={self.lighteval.eval_interval} must be a multiple of checkpoint_interval={self.checkpoints.checkpoint_interval}"

# Some final sanity checks across separate arguments sections:
if self.profiler is not None and self.profiler.profiler_export_path is not None:
@@ -542,14 +550,15 @@ def global_batch_size(self):
def global_batch_size_in_tokens(self):
return self.global_batch_size * self.tokens.sequence_length

def save_as_yaml(self, file_path: str):
def save_as_yaml(self, file_path: str, sanity_checks: bool = True):
config_dict = serialize(self)
file_path = str(file_path)
with open(file_path, "w") as f:
yaml.dump(config_dict, f)

# Sanity test config can be reloaded
_ = get_config_from_file(file_path, config_class=self.__class__)
if sanity_checks:
_ = get_config_from_file(file_path, config_class=self.__class__)

def get_yaml(self):
config_dict = serialize(self)
@@ -620,6 +629,7 @@ def get_config_from_dict(
PipelineEngine: cast_str_to_pipeline_engine,
TensorParallelLinearMode: lambda x: TensorParallelLinearMode[x.upper()],
RecomputeGranularity: lambda x: RecomputeGranularity[x.upper()],
InitScalingMethod: lambda x: InitScalingMethod[x.upper()],
SamplerType: lambda x: SamplerType[x.upper()],
},
# strict_unions_match=True,
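
As a worked example of the `__post_init__` check added above (the interval values are made up): requiring `eval_interval` to be a multiple of `checkpoint_interval` guarantees that every evaluation step is also a checkpoint step, so there is always a fresh checkpoint to evaluate.

```python
# Hypothetical values; in practice these come from the YAML config.
checkpoint_interval = 500
eval_interval = 1500  # must be a multiple of checkpoint_interval (None would inherit it)

# This mirrors the assertion in Config.__post_init__ above.
assert eval_interval % checkpoint_interval == 0

# Every step that triggers an eval also produced a checkpoint.
eval_steps = [step for step in range(1, 10_001) if step % eval_interval == 0]
assert all(step % checkpoint_interval == 0 for step in eval_steps)
```

The new `sanity_checks` flag on `save_as_yaml` is independent of this: it only controls whether the config is reloaded from disk after being written, a round-trip that previously always followed a save.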
48 changes: 44 additions & 4 deletions src/nanotron/config/lighteval_config.py
@@ -73,6 +73,22 @@ def __post_init__(self):
assert self.wandb_project != "", "Please specify a wandb_project"


@dataclass
class LightEvalSlurm:
"""Arguments related to SLURM configuration for LightEval"""

gpus_per_node: int = 8
partition: str = "hopper-prod"
hf_cache: str = "~/.cache/huggingface"
cpus_per_task: int = 88
qos: str = "low"
time: str = "24:00:00"
reservation: Optional[str] = "smollm"

def __post_init__(self):
self.hf_cache = str(Path(self.hf_cache).expanduser())


@dataclass
class LightEvalConfig:
"""Arguments related to running LightEval on checkpoints.
@@ -81,13 +97,37 @@ class LightEvalConfig:
the saved config when running LightEval after training.
"""

slurm_template: Optional[str] = None
slurm_script_dir: Optional[str] = None

checkpoints_path: Optional[str] = None
slurm_script_dir: Optional[Path] = Path("eval_results/launch-config")
logs_path: Optional[Path] = Path("eval_results/logs")
local_checkpoint_dir: Path = Path(
"/scratch"
) # Base directory for temporary checkpoint storage, will store under {local_checkpoint_dir}/{run_name}/{step}
parallelism: Optional[ParallelismArgs] = None
batch_size: Optional[int] = None
generation: Optional[Union[GenerationArgs, Dict[str, GenerationArgs]]] = None
tasks: Optional[LightEvalTasksArgs] = None
logging: Optional[LightEvalLoggingArgs] = None
wandb: Optional[LightEvalWandbLoggerConfig] = None
slurm: Optional[LightEvalSlurm] = None
s3_save_path: Optional[str] = None # should not depend on the run_name
output_dir: Optional[str] = None # we should sanity check that it's the same as the one in the eval_config_override
nanotron_path: Optional[str] = "./"
eval_config_override: str = None
eval_config_override: Path = None # Previously hardcoded in run_slurm_one_job
eval_interval: Optional[
int
] = None # Must be multiple of checkpoint_interval. If None, eval will be done after each checkpoint upload to s3
eval_interval_file: Optional[
Path
] = None # If specified, eval_interval will be read from this file upon the next evaluation.

def __post_init__(self):
if self.parallelism is None:
self.parallelism = ParallelismArgs(dp=1, pp=1, tp=1, tp_linear_async_communication=True)
if self.slurm is None:
self.slurm = LightEvalSlurm()
self.local_checkpoint_dir = str(Path(self.local_checkpoint_dir).expanduser())
if self.eval_interval_file is not None and Path(self.eval_interval_file).exists():
logger.warning(
f"Eval interval file {self.eval_interval_file} exists. `eval_interval` will be replaced by the value in the file upon the next evaluation. You should probably delete this file if that's not what you want."
)
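
A quick sketch of how these defaults compose (assumes a working nanotron install; the values shown are the dataclass defaults above, and every field shown here has a default):

```python
from nanotron.config.lighteval_config import LightEvalConfig

cfg = LightEvalConfig()  # all fields above have defaults, so this constructs as-is

# __post_init__ fills in the SLURM and parallelism sections when they are omitted
print(cfg.slurm.gpus_per_node)   # 8
print(cfg.slurm.partition)       # hopper-prod
print(cfg.slurm_script_dir)      # eval_results/launch-config
print(cfg.local_checkpoint_dir)  # /scratch (expanded and stored as a string)
print(cfg.eval_interval)         # None here; the top-level Config sets it to checkpoint_interval
```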
14 changes: 9 additions & 5 deletions src/nanotron/config/models_config.py
@@ -2,6 +2,7 @@
from pathlib import Path
from typing import Any, List, Optional, Union

from nanotron.config.utils_config import InitScalingMethod
from nanotron.nn.attention import ALL_ATTENTION_FUNCTIONS, AttentionImplementation

# The default attention implementation to use
@@ -11,6 +12,7 @@
@dataclass
class RandomInit:
std: float
scaling_method: InitScalingMethod = InitScalingMethod.NUM_LAYERS


@dataclass
@@ -141,11 +143,13 @@ class Qwen2Config:
sliding_window_size: Optional[int] = None
z_loss_enabled: bool = False # Z-loss regularization https://www.jmlr.org/papers/volume24/22-1144/22-1144.pdf
z_loss_coefficient: float = 0.0001 # Default from the paper (10^-4)
no_rope_layer: Optional[int] = None # Skip rope every no_rope_layer layers (see https://arxiv.org/abs/2501.18795 https://arxiv.org/abs/2305.19466 and Llama4)
_fused_rotary_emb: bool = True
_fused_rms_norm: bool = True
_use_qkv_packed: bool = True
_use_doc_masking: bool = True
no_rope_layer: Optional[
int
] = None # Skip rope every no_rope_layer layers (see https://arxiv.org/abs/2501.18795 https://arxiv.org/abs/2305.19466 and Llama4)
_fused_rotary_emb: bool = False
_fused_rms_norm: bool = False
_use_qkv_packed: bool = False
_use_doc_masking: bool = False

# MoE configuration
moe_config: Optional[MoEConfig] = None
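
A short sketch of the new initialization option (import paths follow the modules shown in this diff; the `std` value is arbitrary). Note also that the `_fused_*`/`_use_*` flags on `Qwen2Config` now default to `False`, so fused rotary embeddings, fused RMSNorm, packed QKV and document masking become opt-in.

```python
from nanotron.config.models_config import RandomInit
from nanotron.config.utils_config import InitScalingMethod

# scaling_method defaults to NUM_LAYERS
init = RandomInit(std=0.02)
assert init.scaling_method is InitScalingMethod.NUM_LAYERS

# opt out of depth-dependent scaling explicitly
init_no_scaling = RandomInit(std=0.02, scaling_method=InitScalingMethod.NONE)
```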
9 changes: 9 additions & 0 deletions src/nanotron/config/utils_config.py
@@ -18,6 +18,13 @@ class RecomputeGranularity(Enum):
FULL = auto()


class InitScalingMethod(Enum):
NONE = auto()
NUM_LAYERS = auto()
LAYER_INDEX = auto()
MODEL_SCALE = auto()


def serialize(data) -> dict:
"""Recursively serialize a nested dataclass to a dict - do some type conversions along the way"""
if data is None:
@@ -39,6 +46,8 @@ def serialize(data) -> dict:
result[field.name] = value.name
elif isinstance(value, RecomputeGranularity):
result[field.name] = value.name
elif isinstance(value, InitScalingMethod):
result[field.name] = value.name
elif isinstance(value, SamplerType):
result[field.name] = value.name
elif isinstance(value, torch.dtype):
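
Together with the `InitScalingMethod: lambda x: InitScalingMethod[x.upper()]` entry added in `config.py`, this gives a name-based round trip for the new enum; a tiny sketch:

```python
from nanotron.config.utils_config import InitScalingMethod

# serialize() stores the enum by member name, e.g. "NUM_LAYERS"
assert InitScalingMethod.NUM_LAYERS.name == "NUM_LAYERS"
# config loading maps the (case-insensitive) string back to the enum member
assert InitScalingMethod["num_layers".upper()] is InitScalingMethod.NUM_LAYERS
```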
1 change: 1 addition & 0 deletions src/nanotron/data/clm_collator.py
@@ -97,6 +97,7 @@ def __call__(self, examples: List[Dict[str, List[np.ndarray]]]) -> Dict[str, Uni
result["label_mask"] = np.ones((batch_size, self.sequence_length), dtype=np.bool_)

# Context Parallelism: Each CP rank gets a slice of the label_ids and label_mask
cp_rank, cp_size = dist.get_rank(self.parallel_context.cp_pg), self.parallel_context.context_parallel_size
local_slice = slice(
cp_rank * self.sequence_length // cp_size, (cp_rank + 1) * self.sequence_length // cp_size
)
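
A worked example of the slicing above with made-up sizes (`sequence_length=8`, `cp_size=2`): each context-parallel rank receives a contiguous, equally sized chunk of the labels.

```python
sequence_length, cp_size = 8, 2  # hypothetical values

for cp_rank in range(cp_size):
    local_slice = slice(
        cp_rank * sequence_length // cp_size, (cp_rank + 1) * sequence_length // cp_size
    )
    print(cp_rank, list(range(sequence_length))[local_slice])
# 0 [0, 1, 2, 3]
# 1 [4, 5, 6, 7]
```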
13 changes: 13 additions & 0 deletions src/nanotron/eval/README.md
@@ -0,0 +1,13 @@
# Nanotron Evaluation

This directory contains code for evaluating models trained with Nanotron.

## Installation

To use the evaluation functionality, you need to install the `lighteval` package:

```bash
uv pip install lighteval[dev]
```

## Usage
3 changes: 3 additions & 0 deletions src/nanotron/eval/__init__.py
@@ -0,0 +1,3 @@
# flake8: noqa: F401

from .one_job_runner import LightEvalRunner