forked from batteryphil/mamba2backbonerecursion
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathphase2_joint_training.py
More file actions
84 lines (64 loc) · 3.15 KB
/
phase2_joint_training.py
File metadata and controls
84 lines (64 loc) · 3.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import torch
import os
from torch.utils.data import DataLoader
from mamba_ssm import MambaLMHeadModel
from mamba1_engine import RecursiveMamba1_PrefixScratchpad, MODEL_ID
from dataset_rlf import RLFAdversarialDataset, collate_rlf
def train():
    """Run Phase 2 joint training for the recursive Mamba-130m scratchpad model.

    Loads the frozen Mamba backbone, wraps it with the recursive prefix-
    scratchpad head, restores the Phase 1 checkpoint, unfreezes LoRA
    parameters, and trains on clean (non-adversarial) RLF data. Saves
    periodic checkpoints every 500 steps and stops early once the moving
    average RLF accuracy over the last 50 steps reaches 0.97.

    Side effects: creates ``saved_weights/`` and writes checkpoint files
    there; prints progress to stdout. Returns None.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Starting Mamba-130m Phase 2: Joint Training on {device}")
    os.makedirs("saved_weights", exist_ok=True)

    # Load the pretrained backbone in bfloat16 and attach the scratchpad head.
    backbone = MambaLMHeadModel.from_pretrained(MODEL_ID, dtype=torch.bfloat16, device=device)
    model = RecursiveMamba1_PrefixScratchpad(backbone, lora_rank=4).to(device)

    # Phase 2 depends on Phase 1 weights; abort early if they are missing.
    p1_ckpt = "saved_weights/mamba130m_phase1_scratchpad.pt"
    if not os.path.exists(p1_ckpt):
        print(f"ERROR: Could not find {p1_ckpt}. Please run Phase 1 first.")
        return
    model.load_state_dict(torch.load(p1_ckpt, map_location=device))
    print(f"Successfully loaded Phase 1 checkpoint: {p1_ckpt}")

    # Unfreeze LoRA adapters for the joint phase.
    for name, param in model.named_parameters():
        if "lora" in name.lower():
            param.requires_grad = True

    # One param group over everything trainable (LoRA + whatever Phase 1 left on).
    params = [
        {"params": [p for n, p in model.named_parameters() if p.requires_grad], "lr": 1e-4, "weight_decay": 0.01}
    ]
    optimizer = torch.optim.AdamW(params)

    # Phase 2 uses CLEAN data (no adversarial distractors).
    dataset = RLFAdversarialDataset(size=12000, seq_len=512, mode="clean")
    loader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_rlf, drop_last=True)

    model.train()
    step = 0
    total_steps = 3000
    recent_accs = []  # sliding window (max 50) of per-step RLF accuracies
    done = False      # set on early stop so the OUTER loop also terminates
    while step < total_steps and not done:
        for inputs, targets, starts in loader:
            if step >= total_steps:
                break
            inputs = inputs.to(device)
            # NOTE(review): `targets` and `starts` are not moved to `device` —
            # presumably the model handles that internally; confirm.
            optimizer.zero_grad()
            with torch.autocast(device_type=device, dtype=torch.bfloat16):
                loss, acc, ans_acc, halt_acc = model(inputs, chain_targets=targets, ans_starts=starts)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            if step % 20 == 0:
                print(f"Phase 2 Step {step} | Loss {loss.item():.4f} | RLF Acc {acc:.2f} | Ans Acc {ans_acc:.2f} | Halt Acc {halt_acc:.2f}")
            if step > 0 and step % 500 == 0:
                torch.save(model.state_dict(), f"saved_weights/mamba130m_phase2_joint_step{step}.pt")
            recent_accs.append(acc)
            if len(recent_accs) > 50:
                recent_accs.pop(0)
            avg_acc = sum(recent_accs) / len(recent_accs)
            if step > 500 and avg_acc >= 0.97:
                print(f"Early stopping at step {step}! Moving average RLF Acc reached {avg_acc:.3f} >= 0.97")
                # BUG FIX: the original bare `break` only exited this inner
                # loop; the outer `while` then restarted the dataloader and
                # training silently continued. `done` stops both loops.
                done = True
                break
            step += 1

    torch.save(model.state_dict(), "saved_weights/mamba130m_phase2_joint_best.pt")
    print("Phase 2 Complete -> saved_weights/mamba130m_phase2_joint_best.pt")
# Script entry point: run Phase 2 joint training when executed directly
# (not when imported as a module).
if __name__ == "__main__":
    train()