7 changes: 6 additions & 1 deletion .github/PULL_REQUEST_TEMPLATE.md
@@ -1,3 +1,8 @@
<!--
IMPORTANT: If this PR targets the `main` branch, it must come from the `dev` branch.
PRs to main from other branches will be rejected.
-->

# What does this PR do?

<!--
@@ -30,4 +35,4 @@ Fixes # (issue)
Anyone in the community is free to review the PR once the tests have passed. Feel free to tag
members/contributors who may be interested in your PR.

<!-- Your PR will be replied to more quickly if you can figure out the right person to tag with @ -->
<!-- Your PR will be replied to more quickly if you can figure out the right person to tag with @ -->
15 changes: 15 additions & 0 deletions .github/workflows/pr-rules.yaml
@@ -0,0 +1,15 @@
name: Check PR Source Branch
on:
  pull_request:
    branches:
      - main

jobs:
  check-branch:
    runs-on: ubuntu-latest
    steps:
      - name: Check PR source branch
        if: github.base_ref == 'main' && github.head_ref != 'dev'
        run: |
          echo "ERROR: PRs to main must come from dev branch"
          exit 1
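
For reference, a minimal Python sketch of the rule this step enforces (`base_ref`/`head_ref` mirror GitHub's `github.base_ref`/`github.head_ref` context values; this is illustrative only and not part of the workflow):

```python
def pr_source_allowed(base_ref: str, head_ref: str) -> bool:
    """PRs targeting main are only accepted when they come from dev."""
    return not (base_ref == "main" and head_ref != "dev")


# dev -> main is accepted; any other head branch targeting main fails the check
assert pr_source_allowed("main", "dev")
assert not pr_source_allowed("main", "my-feature")
# PRs targeting branches other than main are unaffected
assert pr_source_allowed("dev", "my-feature")
```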
4 changes: 3 additions & 1 deletion README.md
@@ -33,6 +33,8 @@ Nanotron is a library for pretraining transformer models. It provides a simple a

📚 **Check out our [Ultrascale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook)** - A comprehensive guide to efficiently scale LLM training with Nanotron!

📝 **AI-generated docs thanks to [DeepWiki](https://deepwiki.com/huggingface/nanotron)**

## Installation

To run the code in this project, first create a Python virtual environment using e.g. `uv`:
@@ -108,7 +110,7 @@ For detailed instructions on training your first model, check out our [Your Firs
torchrun --nproc_per_node=1 run_generate.py --ckpt-path checkpoints/{checkpoint_number}/ --tp 1 --pp 1
```

Increase the value of `--tp` (tensor paralle) to accelerate generation with multiple GPUs and use a larger value of `--pp` (pipeline parallel) for very large models.
Increase the value of `--tp` (tensor parallel) to accelerate generation with multiple GPUs and use a larger value of `--pp` (pipeline parallel) for very large models.

### Debugging with VSCode
To debug with VSCode, add the following configuration to your `launch.json` file:
14 changes: 12 additions & 2 deletions src/nanotron/config/config.py
@@ -17,6 +17,7 @@
from nanotron.config.models_config import ExistingCheckpointInit, NanotronConfigs, RandomInit, SpectralMupInit
from nanotron.config.parallelism_config import ParallelismArgs
from nanotron.config.utils_config import (
InitScalingMethod,
RecomputeGranularity,
cast_str_to_pipeline_engine,
cast_str_to_torch_dtype,
@@ -460,6 +461,13 @@ def __post_init__(self):

if self.s3_upload is not None:
self.s3_upload.__post_init__()
if self.lighteval is not None:
if self.lighteval.eval_interval is None:
self.lighteval.eval_interval = self.checkpoints.checkpoint_interval
else:
assert (
self.lighteval.eval_interval % self.checkpoints.checkpoint_interval == 0
), f"eval_interval={self.lighteval.eval_interval} must be a multiple of checkpoint_interval={self.checkpoints.checkpoint_interval}"

# Some final sanity checks across separate arguments sections:
if self.profiler is not None and self.profiler.profiler_export_path is not None:
@@ -542,14 +550,15 @@ def global_batch_size(self):
def global_batch_size_in_tokens(self):
return self.global_batch_size * self.tokens.sequence_length

def save_as_yaml(self, file_path: str):
def save_as_yaml(self, file_path: str, sanity_checks: bool = True):
config_dict = serialize(self)
file_path = str(file_path)
with open(file_path, "w") as f:
yaml.dump(config_dict, f)

# Sanity test config can be reloaded
_ = get_config_from_file(file_path, config_class=self.__class__)
if sanity_checks:
_ = get_config_from_file(file_path, config_class=self.__class__)

def get_yaml(self):
config_dict = serialize(self)
@@ -620,6 +629,7 @@ def get_config_from_dict(
PipelineEngine: cast_str_to_pipeline_engine,
TensorParallelLinearMode: lambda x: TensorParallelLinearMode[x.upper()],
RecomputeGranularity: lambda x: RecomputeGranularity[x.upper()],
InitScalingMethod: lambda x: InitScalingMethod[x.upper()],
SamplerType: lambda x: SamplerType[x.upper()],
},
# strict_unions_match=True,
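
As a worked example of the `__post_init__` check added above (the interval values are made up): requiring `eval_interval` to be a multiple of `checkpoint_interval` guarantees that every evaluation step is also a checkpoint step, so there is always a fresh checkpoint to evaluate.

```python
# Hypothetical values; in practice these come from the YAML config.
checkpoint_interval = 500
eval_interval = 1500  # must be a multiple of checkpoint_interval (None would inherit it)

# This mirrors the assertion in Config.__post_init__ above.
assert eval_interval % checkpoint_interval == 0

# Every step that triggers an eval also produced a checkpoint.
eval_steps = [step for step in range(1, 10_001) if step % eval_interval == 0]
assert all(step % checkpoint_interval == 0 for step in eval_steps)
```

The new `sanity_checks` flag on `save_as_yaml` is independent of this: it only controls whether the config is reloaded from disk after being written, a round-trip that previously always followed a save.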
48 changes: 44 additions & 4 deletions src/nanotron/config/lighteval_config.py
@@ -73,6 +73,22 @@ def __post_init__(self):
assert self.wandb_project != "", "Please specify a wandb_project"


@dataclass
class LightEvalSlurm:
"""Arguments related to SLURM configuration for LightEval"""

gpus_per_node: int = 8
partition: str = "hopper-prod"
hf_cache: str = "~/.cache/huggingface"
cpus_per_task: int = 88
qos: str = "low"
time: str = "24:00:00"
reservation: Optional[str] = "smollm"

def __post_init__(self):
self.hf_cache = str(Path(self.hf_cache).expanduser())


@dataclass
class LightEvalConfig:
"""Arguments related to running LightEval on checkpoints.
@@ -81,13 +97,37 @@ class LightEvalConfig:
the saved config when running LightEval after training.
"""

slurm_template: Optional[str] = None
slurm_script_dir: Optional[str] = None

checkpoints_path: Optional[str] = None
slurm_script_dir: Optional[Path] = Path("eval_results/launch-config")
logs_path: Optional[Path] = Path("eval_results/logs")
local_checkpoint_dir: Path = Path(
"/scratch"
) # Base directory for temporary checkpoint storage, will store under {local_checkpoint_dir}/{run_name}/{step}
parallelism: Optional[ParallelismArgs] = None
batch_size: Optional[int] = None
generation: Optional[Union[GenerationArgs, Dict[str, GenerationArgs]]] = None
tasks: Optional[LightEvalTasksArgs] = None
logging: Optional[LightEvalLoggingArgs] = None
wandb: Optional[LightEvalWandbLoggerConfig] = None
slurm: Optional[LightEvalSlurm] = None
s3_save_path: Optional[str] = None # should not depend on the run_name
output_dir: Optional[str] = None # we should sanity check that it's the same as the one in the eval_config_override
nanotron_path: Optional[str] = "./"
eval_config_override: str = None
eval_config_override: Path = None # Previously hardcoded in run_slurm_one_job
eval_interval: Optional[
int
] = None # Must be multiple of checkpoint_interval. If None, eval will be done after each checkpoint upload to s3
eval_interval_file: Optional[
Path
] = None # If specified, eval_interval will be read from this file upon the next evaluation.

def __post_init__(self):
if self.parallelism is None:
self.parallelism = ParallelismArgs(dp=1, pp=1, tp=1, tp_linear_async_communication=True)
if self.slurm is None:
self.slurm = LightEvalSlurm()
self.local_checkpoint_dir = str(Path(self.local_checkpoint_dir).expanduser())
if self.eval_interval_file is not None and Path(self.eval_interval_file).exists():
logger.warning(
f"Eval interval file {self.eval_interval_file} exists. `eval_interval` will be replaced by the value in the file upon the next evaluation. You should probably delete this file if that's not what you want."
)
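
A quick sketch of how these defaults compose (assumes a working nanotron install; the values shown are the dataclass defaults above, and every field shown here has a default):

```python
from nanotron.config.lighteval_config import LightEvalConfig

cfg = LightEvalConfig()  # all fields above have defaults, so this constructs as-is

# __post_init__ fills in the SLURM and parallelism sections when they are omitted
print(cfg.slurm.gpus_per_node)   # 8
print(cfg.slurm.partition)       # hopper-prod
print(cfg.slurm_script_dir)      # eval_results/launch-config
print(cfg.local_checkpoint_dir)  # /scratch (expanded and stored as a string)
print(cfg.eval_interval)         # None here; the top-level Config sets it to checkpoint_interval
```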
14 changes: 9 additions & 5 deletions src/nanotron/config/models_config.py
@@ -2,6 +2,7 @@
from pathlib import Path
from typing import Any, List, Optional, Union

from nanotron.config.utils_config import InitScalingMethod
from nanotron.nn.attention import ALL_ATTENTION_FUNCTIONS, AttentionImplementation

# The default attention implementation to use
@@ -11,6 +12,7 @@
@dataclass
class RandomInit:
std: float
scaling_method: InitScalingMethod = InitScalingMethod.NUM_LAYERS


@dataclass
@@ -141,11 +143,13 @@ class Qwen2Config:
sliding_window_size: Optional[int] = None
z_loss_enabled: bool = False # Z-loss regularization https://www.jmlr.org/papers/volume24/22-1144/22-1144.pdf
z_loss_coefficient: float = 0.0001 # Default from the paper (10^-4)
no_rope_layer: Optional[int] = None # Skip rope every no_rope_layer layers (see https://arxiv.org/abs/2501.18795 https://arxiv.org/abs/2305.19466 and Llama4)
_fused_rotary_emb: bool = True
_fused_rms_norm: bool = True
_use_qkv_packed: bool = True
_use_doc_masking: bool = True
no_rope_layer: Optional[
int
] = None # Skip rope every no_rope_layer layers (see https://arxiv.org/abs/2501.18795 https://arxiv.org/abs/2305.19466 and Llama4)
_fused_rotary_emb: bool = False
_fused_rms_norm: bool = False
_use_qkv_packed: bool = False
_use_doc_masking: bool = False

# MoE configuration
moe_config: Optional[MoEConfig] = None
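
A short sketch of the new initialization option (import paths follow the modules shown in this diff; the `std` value is arbitrary). Note also that the `_fused_*`/`_use_*` flags on `Qwen2Config` now default to `False`, so fused rotary embeddings, fused RMSNorm, packed QKV and document masking become opt-in.

```python
from nanotron.config.models_config import RandomInit
from nanotron.config.utils_config import InitScalingMethod

# scaling_method defaults to NUM_LAYERS
init = RandomInit(std=0.02)
assert init.scaling_method is InitScalingMethod.NUM_LAYERS

# opt out of depth-dependent scaling explicitly
init_no_scaling = RandomInit(std=0.02, scaling_method=InitScalingMethod.NONE)
```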
9 changes: 9 additions & 0 deletions src/nanotron/config/utils_config.py
@@ -18,6 +18,13 @@ class RecomputeGranularity(Enum):
FULL = auto()


class InitScalingMethod(Enum):
NONE = auto()
NUM_LAYERS = auto()
LAYER_INDEX = auto()
MODEL_SCALE = auto()


def serialize(data) -> dict:
"""Recursively serialize a nested dataclass to a dict - do some type conversions along the way"""
if data is None:
@@ -39,6 +46,8 @@ def serialize(data) -> dict:
result[field.name] = value.name
elif isinstance(value, RecomputeGranularity):
result[field.name] = value.name
elif isinstance(value, InitScalingMethod):
result[field.name] = value.name
elif isinstance(value, SamplerType):
result[field.name] = value.name
elif isinstance(value, torch.dtype):
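
Together with the `InitScalingMethod: lambda x: InitScalingMethod[x.upper()]` entry added in `config.py`, this gives a name-based round trip for the new enum; a tiny sketch:

```python
from nanotron.config.utils_config import InitScalingMethod

# serialize() stores the enum by member name, e.g. "NUM_LAYERS"
assert InitScalingMethod.NUM_LAYERS.name == "NUM_LAYERS"
# config loading maps the (case-insensitive) string back to the enum member
assert InitScalingMethod["num_layers".upper()] is InitScalingMethod.NUM_LAYERS
```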
1 change: 1 addition & 0 deletions src/nanotron/data/clm_collator.py
@@ -97,6 +97,7 @@ def __call__(self, examples: List[Dict[str, List[np.ndarray]]]) -> Dict[str, Uni
result["label_mask"] = np.ones((batch_size, self.sequence_length), dtype=np.bool_)

# Context Parallelism: Each CP rank gets a slice of the label_ids and label_mask
cp_rank, cp_size = dist.get_rank(self.parallel_context.cp_pg), self.parallel_context.context_parallel_size
local_slice = slice(
cp_rank * self.sequence_length // cp_size, (cp_rank + 1) * self.sequence_length // cp_size
)
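
A worked example of the slicing above with made-up sizes (`sequence_length=8`, `cp_size=2`): each context-parallel rank receives a contiguous, equally sized chunk of the labels.

```python
sequence_length, cp_size = 8, 2  # hypothetical values

for cp_rank in range(cp_size):
    local_slice = slice(
        cp_rank * sequence_length // cp_size, (cp_rank + 1) * sequence_length // cp_size
    )
    print(cp_rank, list(range(sequence_length))[local_slice])
# 0 [0, 1, 2, 3]
# 1 [4, 5, 6, 7]
```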
13 changes: 13 additions & 0 deletions src/nanotron/eval/README.md
@@ -0,0 +1,13 @@
# Nanotron Evaluation

This directory contains code for evaluating models trained with Nanotron.

## Installation

To use the evaluation functionality, you need to install the `lighteval` package:

```bash
uv pip install lighteval[dev]
```

## Usage
3 changes: 3 additions & 0 deletions src/nanotron/eval/__init__.py
@@ -0,0 +1,3 @@
# flake8: noqa: F401

from .one_job_runner import LightEvalRunner