diff --git a/tests/test_token_ids_unique.py b/tests/test_token_ids_unique.py
new file mode 100644
index 0000000..08e534d
--- /dev/null
+++ b/tests/test_token_ids_unique.py
@@ -0,0 +1,44 @@
+# tests/test_token_ids_unique.py
+# Checks that token IDs are unique. We don't check token "names" (dict keys are unique by definition).
+
+from collections import Counter, defaultdict
+
+import pytest
+import tiktoken
+
+ENCODING_NAMES = tiktoken.list_encoding_names()
+
+
+@pytest.mark.parametrize("enc_name", ENCODING_NAMES)
+def test_special_token_ids_are_unique(enc_name):
+    """Special tokens: no two different names should share the same token id."""
+    enc = tiktoken.get_encoding(enc_name)
+    # `_special_tokens` is private; treat a missing attribute as "no special tokens".
+    special_tokens = getattr(enc, "_special_tokens", {})
+    if not special_tokens:
+        pytest.skip(f"{enc_name}: no special tokens")
+
+    # Group names by id so a failure reports exactly which names collide.
+    names_by_id = defaultdict(list)
+    for name, token_id in special_tokens.items():
+        names_by_id[token_id].append(name)
+
+    dups = {tid: names for tid, names in names_by_id.items() if len(names) > 1}
+    assert not dups, f"{enc_name}: duplicated special token ids: {dups}"
+
+
+@pytest.mark.parametrize("enc_name", ENCODING_NAMES)
+def test_mergeable_token_ids_are_unique(enc_name):
+    """Mergeable (vocab) tokens: token ids should be unique.
+
+    Note: some builds may not expose `_mergeable_ranks` on Python side; skip in that case.
+    """
+    enc = tiktoken.get_encoding(enc_name)
+    mergeable_ranks = getattr(enc, "_mergeable_ranks", None)
+    if not mergeable_ranks:
+        pytest.skip(f"{enc_name}: mergeable ranks not exposed")
+
+    # Count ids so a failure names the offending ones, like the special-token test.
+    id_counts = Counter(mergeable_ranks.values())
+    dups = {tid: count for tid, count in id_counts.items() if count > 1}
+    assert not dups, f"{enc_name}: duplicated mergeable token ids: {dups}"
diff --git a/tiktoken_ext/openai_public.py b/tiktoken_ext/openai_public.py
index 02c9ee2..ce1c6bf 100644
--- a/tiktoken_ext/openai_public.py
+++ b/tiktoken_ext/openai_public.py
@@ -142,7 +142,12 @@ def o200k_harmony():
         "<|reserved_200010|>": 200010,
         "<|reserved_200011|>": 200011,
         "<|call|>": 200012,
-    } | {f"<|reserved_{i}|>": i for i in range(200013, 201088)}
+        "<|reserved_200013|>": 200013,
+        "<|reserved_200014|>": 200014,
+        "<|reserved_200015|>": 200015,
+        "<|reserved_200016|>": 200016,
+        "<|reserved_200017|>": 200017,
+    } | {f"<|reserved_{i}|>": i for i in range(200019, 201088)}
     return {
         "name": name,
         "pat_str": pat_str,