forked from batteryphil/mamba2backbonerecursion
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathexport_bpe_table.py
More file actions
109 lines (93 loc) · 3.83 KB
/
export_bpe_table.py
File metadata and controls
109 lines (93 loc) · 3.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
"""
export_bpe_table.py — Export GPT-NeoX BPE tokenizer for bare-metal C inference.
=================================================================================
Extracts vocabulary + merge table from HuggingFace tokenizer and serializes
to a compact .bpe.bin binary format.
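Format (all integers are unsigned, little-endian):
[4B magic "BPE\0"]
[4B vocab_size]
[4B merge_count]
[4B max_token_len, i.e. the longest vocabulary entry in UTF-8 bytes]
[vocab_size entries, in token-id order: [2B len][len bytes utf8]]
[merge_count entries, in the tokenizer's merge order: [2B id_a][2B id_b][2B result_id]]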
Usage:
python export_bpe_table.py [output.bpe.bin]
"""
import struct
import sys
from transformers import AutoTokenizer
def _collect_merges(tok, vocab):
    """Return the BPE merge table as ``(id_a, id_b, result_id)`` triples.

    Three sources are tried, in order:

    1. ``tok.bpe_ranks`` (a ``{(a, b): rank}`` dict), sorted by rank.
    2. ``tok.backend_tokenizer.model.merges`` when that attribute exists.
    3. The ``model.merges`` list inside the JSON produced by
       ``tok.backend_tokenizer.to_str()``. This is the fallback for backends
       whose model object does not expose ``merges`` directly; without it
       the exported merge table would silently be empty.

    Merge rules whose parts or result are missing from ``vocab`` are skipped.
    """
    import json

    pairs = []
    bpe_ranks = getattr(tok, 'bpe_ranks', None)
    if bpe_ranks:
        ranked = sorted(bpe_ranks.items(), key=lambda item: item[1])
        pairs = [pair for pair, _rank in ranked]
    else:
        backend = getattr(tok, 'backend_tokenizer', None)
        raw_merges = getattr(getattr(backend, 'model', None), 'merges', None)
        if raw_merges is None and backend is not None:
            serialized = json.loads(backend.to_str())
            raw_merges = serialized.get('model', {}).get('merges', [])
        for entry in raw_merges or []:
            # Merges may be serialized as "a b" strings or as [a, b] pairs.
            parts = entry.split(' ') if isinstance(entry, str) else list(entry)
            if len(parts) == 2:
                pairs.append((parts[0], parts[1]))

    merges = []
    for left, right in pairs:
        triple = (vocab.get(left), vocab.get(right), vocab.get(left + right))
        if all(token_id is not None for token_id in triple):
            merges.append(triple)
    return merges


def export_bpe(output_path: str = "tokenizer.bpe.bin") -> None:
    """Export GPT-NeoX tokenizer to binary format for bare-metal.

    Loads the ``EleutherAI/gpt-neox-20b`` tokenizer, registers the
    ``<THINK>`` and ``<HALT>`` special tokens, and writes the vocabulary and
    merge table to ``output_path`` (little-endian):

        magic ``b"BPE\\x00"``, then ``vocab_size``, ``merge_count`` and
        ``max_token_len`` as uint32; one ``[uint16 len][bytes]`` record per
        token id; one ``[uint16 id_a][uint16 id_b][uint16 result_id]``
        record per merge.

    Args:
        output_path: Destination file; overwritten if it exists.

    Raises:
        ValueError: If a token id or an encoded token length does not fit in
            the format's 16-bit fields. Raised before the file is opened so
            no truncated file is left behind.
    """
    import os

    u16_max = 0xFFFF

    print("Loading GPT-NeoX tokenizer...")
    tok = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
    tok.add_special_tokens({"additional_special_tokens": ["<THINK>", "<HALT>"]})
    vocab = tok.get_vocab()

    # Size the table by the largest id rather than len(vocab): if the ids are
    # not contiguous, len(vocab) is too small and high-id tokens would be
    # dropped. Unused slots are written as empty strings.
    vocab_size = max(vocab.values(), default=-1) + 1
    id_to_token = [""] * vocab_size
    for token_str, token_id in vocab.items():
        id_to_token[token_id] = token_str

    merges = _collect_merges(tok, vocab)
    if not merges:
        print(" WARNING: no merges extracted; merge table will be empty",
              file=sys.stderr)

    # NOTE(review): tokens are written in the tokenizer's own surface form
    # (whatever strings get_vocab() returns), UTF-8 encoded. If that form is
    # a byte-to-unicode remapping, the C reader has to undo it -- confirm.
    encoded_tokens = [t.encode('utf-8', errors='replace') for t in id_to_token]
    max_token_len = max(map(len, encoded_tokens), default=0)

    # Validate the 16-bit fields up front instead of failing mid-write.
    if max_token_len > u16_max:
        raise ValueError(
            f"token of {max_token_len} bytes exceeds the 2-byte length field")
    if merges and max(max(triple) for triple in merges) > u16_max:
        raise ValueError("merge table references a token id above 65535")

    print(f" Vocab size: {vocab_size:,}")
    print(f" Merge count: {len(merges):,}")
    print(f" Max token len: {max_token_len}")
    print(f" <HALT> ID: {vocab.get('<HALT>', -1)}")

    with open(output_path, 'wb') as f:
        # Header: magic + sizes
        f.write(b'BPE\x00')
        f.write(struct.pack('<III', vocab_size, len(merges), max_token_len))
        # Vocabulary: [2B len][len bytes utf8] for each token, in id order
        for encoded in encoded_tokens:
            f.write(struct.pack('<H', len(encoded)))
            f.write(encoded)
        # Merges: [2B id_a][2B id_b][2B result_id]
        for a_id, b_id, m_id in merges:
            f.write(struct.pack('<HHH', a_id, b_id, m_id))

    # Measure after the file is closed so buffered data is included.
    file_size = os.path.getsize(output_path)
    print(f"\n Saved: {output_path} ({file_size:,} bytes, {file_size/1024:.0f} KB)")

    # Sanity check of the HuggingFace tokenizer itself (encode -> decode);
    # it does not read back the file written above.
    test_strs = [
        "Hello, world!",
        "A = blue. B = A. What is B?\\nAnswer:",
        "The quick brown fox"
    ]
    print("\n Round-trip verification:")
    for s in test_strs:
        ids = tok.encode(s, add_special_tokens=False)
        decoded = tok.decode(ids)
        match = "✓" if decoded.strip() == s.strip() else "✗"
        print(f" {match} \"{s[:40]}\" → {len(ids)} tokens → \"{decoded[:40]}\"")
if __name__ == '__main__':
    # Optional first CLI argument overrides the default output file name.
    cli_args = sys.argv[1:]
    export_bpe(cli_args[0] if cli_args else 'tokenizer.bpe.bin')
"""
Export GPT-NeoX BPE tokenizer for bare-metal C inference.
"""