From 86466405e0428df338edb885a87cc0a24634da06 Mon Sep 17 00:00:00 2001
From: Shikhar Goswami <shikhargoswami2308@gmail.com>
Date: Wed, 17 Mar 2021 20:26:47 +0530
Subject: [PATCH 1/7] Adding GPT2 tokenizer to WordTokenizers

---
 Manifest.toml                     | 161 +++++++++++++++++++++++
 Project.toml                      |   5 +-
 src/WordTokenizers.jl             |   5 +-
 src/statistical/Vocab_DataDeps.jl |  17 ++-
 src/statistical/gpt2tokenizer.jl  | 211 ++++++++++++++++++++++++++++++
 5 files changed, 394 insertions(+), 5 deletions(-)
 create mode 100644 Manifest.toml
 create mode 100644 src/statistical/gpt2tokenizer.jl

diff --git a/Manifest.toml b/Manifest.toml
new file mode 100644
index 0000000..7c27a9c
--- /dev/null
+++ b/Manifest.toml
@@ -0,0 +1,161 @@
+# This file is machine-generated - editing it directly is not advised
+
+[[Artifacts]]
+deps = ["Pkg"]
+git-tree-sha1 = "c30985d8821e0cd73870b17b0ed0ce6dc44cb744"
+uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
+version = "1.3.0"
+
+[[Base64]]
+uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
+
+[[BinaryProvider]]
+deps = ["Libdl", "Logging", "SHA"]
+git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058"
+uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
+version = "0.5.10"
+
+[[DataDeps]]
+deps = ["BinaryProvider", "HTTP", "Libdl", "Reexport", "SHA", "p7zip_jll"]
+git-tree-sha1 = "4f0e41ff461d42cfc62ff0de4f1cd44c6e6b3771"
+uuid = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
+version = "0.7.7"
+
+[[Dates]]
+deps = ["Printf"]
+uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
+
+[[Distributed]]
+deps = ["Random", "Serialization", "Sockets"]
+uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
+
+[[HTML_Entities]]
+deps = ["StrTables"]
+git-tree-sha1 = "aa19515d6ebe7f91a39cfc1dc6341f38fcac1282"
+uuid = "7693890a-d069-55fe-a829-b4a6d304f0ee"
+version = "1.0.0"
+
+[[HTTP]]
+deps = ["Base64", "Dates", "IniFile", "MbedTLS", "NetworkOptions", "Sockets", "URIs"]
+git-tree-sha1 = "c9f380c76d8aaa1fa7ea9cf97bddbc0d5b15adc2"
+uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3"
+version = "0.9.5"
+
+[[IniFile]]
+deps = ["Test"]
+git-tree-sha1 = "098e4d2c533924c921f9f9847274f2ad89e018b8"
+uuid = "83e8ac13-25f8-5344-8a64-a9f2b223428f"
+version = "0.5.0"
+
+[[InteractiveUtils]]
+deps = ["Markdown"]
+uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
+
+[[JLLWrappers]]
+git-tree-sha1 = "a431f5f2ca3f4feef3bd7a5e94b8b8d4f2f647a0"
+uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
+version = "1.2.0"
+
+[[JSON]]
+deps = ["Dates", "Mmap", "Parsers", "Unicode"]
+git-tree-sha1 = "81690084b6198a2e1da36fcfda16eeca9f9f24e4"
+uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
+version = "0.21.1"
+
+[[LibGit2]]
+deps = ["Printf"]
+uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
+
+[[Libdl]]
+uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
+
+[[Logging]]
+uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
+
+[[Markdown]]
+deps = ["Base64"]
+uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
+
+[[MbedTLS]]
+deps = ["Dates", "MbedTLS_jll", "Random", "Sockets"]
+git-tree-sha1 = "1c38e51c3d08ef2278062ebceade0e46cefc96fe"
+uuid = "739be429-bea8-5141-9913-cc70e7f3736d"
+version = "1.0.3"
+
+[[MbedTLS_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
+git-tree-sha1 = "0eef589dd1c26a3ac9d753fe1a8bcad63f956fa6"
+uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
+version = "2.16.8+1"
+
+[[Mmap]]
+uuid = "a63ad114-7e13-5084-954f-fe012c677804"
+
+[[NetworkOptions]]
+git-tree-sha1 = "ed3157f48a05543cce9b241e1f2815f7e843d96e"
+uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
+version = "1.2.0"
+
+[[Parsers]]
+deps = ["Dates"]
+git-tree-sha1 = "223a825cccef2228f3fdbf2ecc7ca93363059073"
+uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
+version = "1.0.16"
+
+[[Pkg]]
+deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
+uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
+
+[[Printf]]
+deps = ["Unicode"]
+uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
+
+[[REPL]]
+deps = ["InteractiveUtils", "Markdown", "Sockets"]
+uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
+
+[[Random]]
+deps = ["Serialization"]
+uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+
+[[Reexport]]
+git-tree-sha1 = "57d8440b0c7d98fc4f889e478e80f268d534c9d5"
+uuid = "189a3867-3050-52da-a836-e630ba90ab69"
+version = "1.0.0"
+
+[[SHA]]
+uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
+
+[[Serialization]]
+uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
+
+[[Sockets]]
+uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
+
+[[StrTables]]
+deps = ["Dates"]
+git-tree-sha1 = "5998faae8c6308acc25c25896562a1e66a3bb038"
+uuid = "9700d1a9-a7c8-5760-9816-a99fda30bb8f"
+version = "1.0.1"
+
+[[Test]]
+deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
+uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[[URIs]]
+git-tree-sha1 = "7855809b88d7b16e9b029afd17880930626f54a2"
+uuid = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4"
+version = "1.2.0"
+
+[[UUIDs]]
+deps = ["Random", "SHA"]
+uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
+
+[[Unicode]]
+uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
+
+[[p7zip_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
+git-tree-sha1 = "ee65cfa19bea645698a0224bfa216f2b1c8b559f"
+uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
+version = "16.2.0+3"
diff --git a/Project.toml b/Project.toml
index a81d92f..5b03149 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,14 +5,15 @@ version = "0.5.6"
 [deps]
 DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
 HTML_Entities = "7693890a-d069-55fe-a829-b4a6d304f0ee"
+JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 StrTables = "9700d1a9-a7c8-5760-9816-a99fda30bb8f"
 Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
 
 [compat]
 DataDeps = "0.6.5, 0.7"
-julia = "1"
-HTML_Entities= "1"
+HTML_Entities = "1"
 StrTables = "1"
+julia = "1"
 
 [extras]
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/src/WordTokenizers.jl b/src/WordTokenizers.jl
index e25eb73..248fba5 100644
--- a/src/WordTokenizers.jl
+++ b/src/WordTokenizers.jl
@@ -4,7 +4,7 @@ module WordTokenizers
 using HTML_Entities
 using StrTables
 using Unicode
-using DataDeps
+using DataDeps, JSON, InternedStrings
 
 abstract type PretrainedTokenizer end
 
@@ -17,7 +17,7 @@ export poormans_tokenize, punctuation_space_tokenize,
        set_tokenizer, set_sentence_splitter,
        rev_tokenize, rev_detokenize,
        toktok_tokenize
-export ALBERT_V1, ALBERT_V2, load, tokenizer, sentence_from_tokens, ids_from_tokens
+export ALBERT_V1, ALBERT_V2, load, tokenizer, sentence_from_tokens, ids_from_tokens, GPT2, GPT2Tokenizer, tokenize
 export PretrainedTokenizer, tokenizer_files
 include("words/fast.jl")
 
@@ -33,6 +33,7 @@ include("set_method_api.jl")
 include("split_api.jl")
 
 include("statistical/unigram.jl")
+include("statistical/gpt2tokenizer.jl")
 
 const pretrained = Dict{DataType, Vector{String}}()
 function tokenizer_files(::Type{T}) where T<:PretrainedTokenizer
diff --git a/src/statistical/Vocab_DataDeps.jl b/src/statistical/Vocab_DataDeps.jl
index d935ba7..ad09dd2 100644
--- a/src/statistical/Vocab_DataDeps.jl
+++ b/src/statistical/Vocab_DataDeps.jl
@@ -1,5 +1,6 @@
 abstract type ALBERT_V1 <: PretrainedTokenizer end
 abstract type ALBERT_V2 <: PretrainedTokenizer end
+abstract type GPT2 <: PretrainedTokenizer end
 
 const vectors_albertversion1 = [
     ("albert_base_v1_30k-clean.vocab",
@@ -40,6 +41,8 @@ const vectors_albertversion2 = [
     "https://raw.githubusercontent.com/tejasvaidhyadev/ALBERT.jl/master/src/Vocabs/albert_xxlarge_v2_30k-clean.vocab")
 ]
 
+const vectors_gpt2 = ["encoder.json", "vocab.bpe"]
+
 function init_vocab_datadeps()
     for (depname, description, sha, link) in vectors_albertversion1
         register(DataDep(depname,
@@ -70,5 +73,17 @@ function init_vocab_datadeps()
                  ))
         append!(tokenizer_files(ALBERT_V2), ["$depname"])
     end
-end
 
+    register(DataDep("GPT2",
+    """
+    Pretrained gpt2 vocabulary and merges file by Open AI.
+    Website: https://openai.com/blog/better-language-models/
+    Author: Radford et al
+    Licence: MIT
+    All GPT2 Models are trained on same size vocabulary.
+    """,
+    ["https://openaipublic.blob.core.windows.net/gpt-2/models/117M/$(file)" for file in vectors_gpt2],
+    "05805f21f823300551adf0646abe905eb036fb272f97c279f0d9c656c845ca46"))
+
+    append!(tokenizer_files(GPT2), ["GPT2/$(file)" for file in vectors_gpt2])
+end
diff --git a/src/statistical/gpt2tokenizer.jl b/src/statistical/gpt2tokenizer.jl
new file mode 100644
index 0000000..52270f6
--- /dev/null
+++ b/src/statistical/gpt2tokenizer.jl
@@ -0,0 +1,211 @@
+"""
+struct GPT2Tokenizer
+    vocab::Dict{String, Any}
+    rank::Dict{Pair{String,String}, Int}
+    cache::Dict{String, Tuple}
+    pat::Regex
+end
+structure, To hold pretrained vocabulary map and merge rules for GPT2
+"""
+struct GPT2Tokenizer
+    vocab::Dict{String, Any}
+    rank::Dict{Pair{String,String}, Int}
+    cache::Dict{String, Tuple}
+    pat::Regex
+
+    function GPT2Tokenizer(::Type{T};pat=r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+") where T<:PretrainedTokenizer
+
+        vocab_file = @datadep_str tokenizer_files(T)[1]
+        bfile = @datadep_str tokenizer_files(T)[2]
+
+        vocab = Dict{String, Any}()
+        rank = Dict{Pair{String, String}, Int}()
+        cache = Dict{String, Tuple}()
+
+        vocab = JSON.parsefile(vocab_file)
+
+        open(bfile) do f
+            for (i, line) ∈ enumerate(eachline(f))
+                if i==1
+                    identity
+                else
+                    pair = Pair(split(line," ")...)
+                    rank[pair] = i-1
+                end
+            end
+        end
+        new(vocab, rank, cache, pat)
+    end
+end
+
+"""
+load(ty::Type{T}) where T<:PretrainedTokenizer
+Initializes the GPT2Tokenizer and loads the vocab and merges files from `DataDeps`
+#Example
+```julia-repl
+julia> tokenizer = load(GPT2)
+GPT2Tokenizer(Dict{String,Any}("ilet" => 41550,"ĠVer" => 4643,"599" => 43452,"ĠRubin" => 34599,"Ġwrestler" => 34845,"Ġsharp" => 7786,"ĠObst" => 46378,"Ġlover" => 18854,"Core" => 14055,"Ġro" => 686…), Dict(("Ġne" => "ver") => 984,("ĠP" => "helps") => 40332,("Ġrapid" => "ly") => 8647,("s" => "af") => 49330,("Ġsn" => "ack") => 26651,("ra" => "ft") => 1362,("ĠCloud" => "s") => 46043,("Ġbrill" => "iant") => 10202,("Ġconsequ" => "ence") => 12666,("Ġplug" => "in") => 13622…), Dict{String,Tuple}(), r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+")
+```
+"""
+function load(ty::Type{T}) where T<:PretrainedTokenizer
+    GPT2Tokenizer(T)
+end
+
+"""
+Returns Dictionary of utf-8 encoding and corresponding unicode strings for Byte-Pair Encoding.
+"""
+function bytes_to_unicode()
+    bs = [33:255...]
+    cs = bs[:]
+    n=0
+    for b in 0:255
+        if b ∉ bs
+            append!(bs, b)
+            append!(cs, 256+n)
+            n+=1
+        end
+    end
+    cs = [Char(n) for n in cs]
+    Dict(zip(bs,cs))
+end
+
+toStrTuple(x::Vector{String})=toStrTuple(join(x))
+function toStrTuple(x::AbstractString)
+    fs = intern.(split(chop(x), ""))
+    push!(fs, intern(x[end]*""))
+    filter!((x)-> x != "", fs)
+    Tuple(fs)
+end
+
+"""
+get_pairs(word::NTuple{})
+Returns set of pairs in a word. Word is a tuple of strings.
+"""
+function get_pairs(word::NTuple{})
+    pairs = Set{Pair{}}()
+    prev_char = word[1]
+    for char in word[2:end]
+        push!(pairs, Pair(prev_char, char))
+        prev_char = char
+    end
+    pairs
+end
+
+lowestpair(pairs::Set{Pair{}},tokenizer::GPT2Tokenizer) = lowestpair(collect(pairs), tokenizer::GPT2Tokenizer)
+lowestpair(pairs::Vector{Pair{}}, tokenizer::GPT2Tokenizer) = argmin(
+    sizehint!(Dict(
+    map(pairs) do p
+        p=>get(tokenizer.rank, p, typemax(Int))
+    end),
+          length(pairs))
+    )
+
+
+function bpe(token::String, tokenizer::GPT2Tokenizer)
+
+    haskey(tokenizer.cache, token) && return tokenizer.cache[token]
+    word = toStrTuple(token)
+    pairs = get_pairs(word)
+    isempty(pairs) && return token
+
+    while true
+        pair = lowestpair(pairs, tokenizer)
+        !haskey(tokenizer.rank, pair) && break
+        first, second = pair
+        new_word=Vector{String}()
+        i=1
+
+        while i <= length(word)
+
+            try
+                j = findnext(isequal(first), word, i)
+                append!(new_word, word[i:j-1])
+                i=j
+            catch
+                append!(new_word,word[i:end])
+                break
+            end
+
+            if word[i]==first && i<=length(word)-1 && word[i+1]==second
+                push!(new_word, first*second)
+                i+=2
+            else
+                push!(new_word, word[i])
+                i+=1
+            end
+        end
+        new_word = Tuple(new_word)
+        word = new_word
+
+        if length(word)==1
+            break
+        else
+            pairs = get_pairs(word)
+        end
+    end
+    tokenizer.cache[token] = word
+    word
+end
+
+"""
+tokenize(text::String, tokenizer::GPT2Tokenizer)
+Implements tokenization of input text. This tokenizer doesn't include unknown and special tokens because
+of its byte-level BPE tokenization. GPT2 model is only trained on end token `<|endoftext|>`. Has to be
+manually added after the tokenization.
+GPT2 Tokenizer treats whitespace as unicode character `\u0120 (Ġ)` before a word.
+
+# Example
+```julia-repl
+julia> tokens = tokenize("Hi! How you doin", tokenizer)
+6-element Array{String,1}:
+ "Hi"
+ "!"
+ "ĠHow"
+ "Ġyou"
+ "Ġdo"
+ "in"
+```
+"""
+function tokenize(text::String, tokenizer::GPT2Tokenizer)
+    mapping = bytes_to_unicode()
+    tokens=Vector{String}()
+    matches = map(eachmatch(tokenizer.pat, text)) do m
+        m.match
+    end
+    for token in matches
+        token = join([mapping[Int(b)] for b in token])
+        append!(tokens, [string(bpe_token) for bpe_token in bpe(token, tokenizer)])
+    end
+    tokens
+end
+
+"""
+ids_from_tokens(tokens::Vector{String}, tokenizer::GPT2Tokenizer)
+Returns respective ids of tokens from pretrained vocabulary map
+
+# Example
+```julia-repl
+julia> tokens = tokenize("Hi! How you doin", tokenizer)
+6-element Array{String,1}:
+ "Hi"
+ "!"
+ "ĠHow"
+ "Ġyou"
+ "Ġdo"
+ "in"
+
+julia> ids_from_tokens(tokens, tokenizer)
+6-element Array{Int64,1}:
+ 17250
+     0
+  1374
+   345
+   466
+   259
+```
+"""
+function ids_from_tokens(tokens::Vector{String}, tokenizer::GPT2Tokenizer)
+    map(tokens) do x
+        last(get(tokenizer.vocab, x, 0))
+    end
+end

From 179c517ca15756e0a565511274f5f85244f60fa8 Mon Sep 17 00:00:00 2001
From: Shikhar Goswami <shikhargoswami2308@gmail.com>
Date: Wed, 17 Mar 2021 21:12:11 +0530
Subject: [PATCH 2/7] Added tests

---
 Manifest.toml                    |  6 ++++++
 Project.toml                     |  1 +
 src/WordTokenizers.jl            |  2 +-
 src/statistical/gpt2tokenizer.jl |  9 ++++++++-
 test/gpt2_tokenizer.jl           | 25 +++++++++++++++++++++++++
 test/runtests.jl                 |  1 +
 6 files changed, 42 insertions(+), 2 deletions(-)
 create mode 100644 test/gpt2_tokenizer.jl

diff --git a/Manifest.toml b/Manifest.toml
index 7c27a9c..ca5eecd 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -51,6 +51,12 @@ version = "0.5.0"
 deps = ["Markdown"]
 uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 
+[[InternedStrings]]
+deps = ["Random", "Test"]
+git-tree-sha1 = "eb05b5625bc5d821b8075a77e4c421933e20c76b"
+uuid = "7d512f48-7fb1-5a58-b986-67e6dc259f01"
+version = "0.7.0"
+
 [[JLLWrappers]]
 git-tree-sha1 = "a431f5f2ca3f4feef3bd7a5e94b8b8d4f2f647a0"
 uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
diff --git a/Project.toml b/Project.toml
index 5b03149..0e55d74 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,6 +5,7 @@ version = "0.5.6"
 [deps]
 DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
 HTML_Entities = "7693890a-d069-55fe-a829-b4a6d304f0ee"
+InternedStrings = "7d512f48-7fb1-5a58-b986-67e6dc259f01"
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 StrTables = "9700d1a9-a7c8-5760-9816-a99fda30bb8f"
 Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
diff --git a/src/WordTokenizers.jl b/src/WordTokenizers.jl
index 248fba5..fb54bb3 100644
--- a/src/WordTokenizers.jl
+++ b/src/WordTokenizers.jl
@@ -17,7 +17,7 @@ export poormans_tokenize, punctuation_space_tokenize,
        set_tokenizer, set_sentence_splitter,
        rev_tokenize, rev_detokenize,
        toktok_tokenize
-export ALBERT_V1, ALBERT_V2, load, tokenizer, sentence_from_tokens, ids_from_tokens, GPT2, GPT2Tokenizer, tokenize
+export ALBERT_V1, ALBERT_V2, load, tokenizer, sentence_from_tokens, ids_from_tokens, GPT2, GPT2Tokenizer, tokenize, sentence_from_tokens_gpt2
 export PretrainedTokenizer, tokenizer_files
 include("words/fast.jl")
 
diff --git a/src/statistical/gpt2tokenizer.jl b/src/statistical/gpt2tokenizer.jl
index 52270f6..5510c3c 100644
--- a/src/statistical/gpt2tokenizer.jl
+++ b/src/statistical/gpt2tokenizer.jl
@@ -44,7 +44,7 @@ Initializes the GPT2Tokenizer and loads the vocab and merges files from `DataDep
 #Example
 ```julia-repl
 julia> tokenizer = load(GPT2)
-GPT2Tokenizer(Dict{String,Any}("ilet" => 41550,"ĠVer" => 4643,"599" => 43452,"ĠRubin" => 34599,"Ġwrestler" => 34845,"Ġsharp" => 7786,"ĠObst" => 46378,"Ġlover" => 18854,"Core" => 14055,"Ġro" => 686…), Dict(("Ġne" => "ver") => 984,("ĠP" => "helps") => 40332,("Ġrapid" => "ly") => 8647,("s" => "af") => 49330,("Ġsn" => "ack") => 26651,("ra" => "ft") => 1362,("ĠCloud" => "s") => 46043,("Ġbrill" => "iant") => 10202,("Ġconsequ" => "ence") => 12666,("Ġplug" => "in") => 13622…), Dict{String,Tuple}(), r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+")
+
 ```
 """
 function load(ty::Type{T}) where T<:PretrainedTokenizer
@@ -209,3 +209,10 @@ function ids_from_tokens(tokens::Vector{String}, tokenizer::GPT2Tokenizer)
         last(get(tokenizer.vocab, x, 0))
     end
 end
+
+function sentence_from_tokens_gpt2(tk::Array{String,1})
+    sen = join(tk)
+    sen = replace(sen, "Ġ" => " ")
+    sen = strip(sen)
+    return sen
+end
diff --git a/test/gpt2_tokenizer.jl b/test/gpt2_tokenizer.jl
new file mode 100644
index 0000000..4d5cab7
--- /dev/null
+++ b/test/gpt2_tokenizer.jl
@@ -0,0 +1,25 @@
+using WordTokenizers
+using Test
+
+tokenizer = load(GPT2)
+
+@testset "Pretrained" begin
+    @test typeof(tokenizer) == WordTokenizers.GPT2Tokenizer
+    @test typeof(tokenizer.vocab) == Dict{String, Any}
+    @test typeof(tokenizer.rank) == Dict{Pair{String,String}, Int}
+    @test typeof(tokenizer.cache) == Dict{String, Tuple}
+    @test typeof(WordTokenizers.pretrained) == Dict{DataType,Array{String,1}}
+    @test length(WordTokenizers.pretrained[GPT2]) == 2
+end
+
+@testset "Tokenizer and helper function" begin
+    @test tokenizer.vocab["Hi"] == 17250
+    @test tokenize("I love julia language", tokenizer) == ["I",
+                                                           "Ġlove",
+                                                           "Ġj",
+                                                           "ulia",
+                                                           "Ġlanguage"]
+    tokens = tokenize("I love julia language", tokenizer)
+    @test ids_from_tokens(tokens, tokenizer) == [40, 1842, 474, 43640, 3303]
+    @test sentence_from_tokens_gpt2(tokens) == "I love julia language"
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 10bb818..0b789ff 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -8,6 +8,7 @@ files = ["simple",
          "tweet_tokenize",
          "reversible_tok",
          "toktok",
+         "gpt2_tokenizer",
          "sp_unigram"
         ]
 

From de91b91ec00c38ab7c89d57f25742c1a30e71ffd Mon Sep 17 00:00:00 2001
From: Shikhar Goswami <shikhargoswami2308@gmail.com>
Date: Thu, 18 Mar 2021 13:10:54 +0530
Subject: [PATCH 3/7] Fixed loading issue

---
 src/WordTokenizers.jl            |  7 ++++++
 src/statistical/gpt2tokenizer.jl |  4 ++--
 src/statistical/unigram.jl       | 38 ++++++++++++++++----------------
 test/gpt2_tokenizer.jl           | 18 +++++++--------
 test/runtests.jl                 |  4 ++--
 5 files changed, 39 insertions(+), 32 deletions(-)

diff --git a/src/WordTokenizers.jl b/src/WordTokenizers.jl
index fb54bb3..bbe84a7 100644
--- a/src/WordTokenizers.jl
+++ b/src/WordTokenizers.jl
@@ -48,4 +48,11 @@ function __init__()
     init_vocab_datadeps()
 end
 
+load(::Val{:ALBERT_V1}) = load_sp(ALBERT_V1)
+load(::Val{:ALBERT_V2}) = load_sp(ALBERT_V2)
+load(::Val{:GPT2}) = load_gpt2(GPT2)
+
+load(::Type{T}) where T<:PretrainedTokenizer = load(Val(Symbol(T)))
+
+
 end # module
diff --git a/src/statistical/gpt2tokenizer.jl b/src/statistical/gpt2tokenizer.jl
index 5510c3c..8e93dd4 100644
--- a/src/statistical/gpt2tokenizer.jl
+++ b/src/statistical/gpt2tokenizer.jl
@@ -39,7 +39,7 @@ struct GPT2Tokenizer
 end
 
 """
-load(ty::Type{T}) where T<:PretrainedTokenizer
+load_gpt2(ty::Type{T}) where T<:PretrainedTokenizer
 Initializes the GPT2Tokenizer and loads the vocab and merges files from `DataDeps`
 #Example
 ```julia-repl
@@ -47,7 +47,7 @@ julia> tokenizer = load(GPT2)
 
 ```
 """
-function load(ty::Type{T}) where T<:PretrainedTokenizer
+function load_gpt2(::Type{T}) where T<:PretrainedTokenizer
     GPT2Tokenizer(T)
 end
 
diff --git a/src/statistical/unigram.jl b/src/statistical/unigram.jl
index 2901b1d..b3808d3 100644
--- a/src/statistical/unigram.jl
+++ b/src/statistical/unigram.jl
@@ -3,7 +3,7 @@ struct SentencePieceModel
   vocab_map::Dict{String, Tuple{Float64, Int}}
   unk_id::Int
 end
-structure, To hold unknown token index and map of vocabulary to log probability and index   
+structure, To hold unknown token index and map of vocabulary to log probability and index
 """
 struct SentencePieceModel
     vocab_map::Dict{String, Tuple{Float64, Int}}
@@ -11,25 +11,25 @@ struct SentencePieceModel
 end
 
 """
-    load(ty::Type{T}, filenum::Int=1; unk_token="<unk>") where T<:PretrainedTokenizer
+    load_sp(ty::Type{T}, filenum::Int=1; unk_token="<unk>") where T<:PretrainedTokenizer
 use to initialize the `SentencePieceModel` by loading the file from `DataDeps`
 # Example
 ```julia-repl
 julia> spm = load(ALBERT_V1)
 ```
 """
-function load(ty::Type{T}, filenum::Int=1; unk_token="<unk>") where T<:PretrainedTokenizer
+function load_sp(ty::Type{T}, filenum::Int=1; unk_token="<unk>") where T<:PretrainedTokenizer
     filepath = @datadep_str tokenizer_files(ty)[filenum]
     name = tokenizer_files(ty)[filenum]
     filepath = "$filepath/$name"
-    load(filepath, unk_token=unk_token)  
+    load_sp(filepath, unk_token=unk_token)
 end
 
 """
-    load(path; unk_token="<unk>") 
+    load_sp(path; unk_token="<unk>")
 use to initialize the SentencePieceModel by providing `vocab filepath`
-"""    
-function load(path; unk_token="<unk>")
+"""
+function load_sp(path; unk_token="<unk>")
     vocab_path = readlines(path)
     vocabnlogp = split.(vocab_path, "\t")
     vocab_map = Dict(tok=>(parse(Float64, logp), index) for (index, (tok, logp)) in enumerate(vocabnlogp))
@@ -37,13 +37,13 @@ function load(path; unk_token="<unk>")
         unk_id = vocab_map[unk_token][2]
     else
         throw(DomainError(unk_token, "Unknown token is not in the vocabulary"))
-    end 
+    end
     spm = SentencePieceModel(vocab_map, unk_id)
     return spm
 end
 
 """
-struct Nodes 
+struct Nodes
     text::String
     score::Float32
     index::Int64
@@ -51,9 +51,9 @@ struct Nodes
     en::Int
 end
 Utility structure, To hold the results of the `forward pass` (the forward Viterbi lattice)
-hold the token token string, score, vocabulary index, start and end character position   
+hold the token token string, score, vocabulary index, start and end character position
 """
-struct Nodes 
+struct Nodes
     text::String
     score::Float32
     index::Int64
@@ -90,10 +90,10 @@ julia> node = WordTokenizers.decode_forward(spm, "I love julia language")
  WordTokenizers.Nodes("gua", -23.776f0, 15259, 17, 19)
  WordTokenizers.Nodes("ag", -34.1531f0, 3303, 19, 20)
  WordTokenizers.Nodes("language", -11.1965f0, 7021, 14, 21)
-``` 
+```
 """
 function decode_forward(sp::SentencePieceModel, text::String)
-    results = Array{Nodes, 1}(undef, lastindex(text)) 
+    results = Array{Nodes, 1}(undef, lastindex(text))
     scores = fill(-Inf, lastindex(text))
     scores[1] = 0
     for char_end in eachindex(text)
@@ -103,7 +103,7 @@ function decode_forward(sp::SentencePieceModel, text::String)
             if haskey(sp.vocab_map, subtoken)
                 subtokenid =  sp.vocab_map[subtoken][2]
                 local_score = scores[char_start] + sp.vocab_map[subtoken][1]
-                if local_score > scores[char_end]   
+                if local_score > scores[char_end]
                     results[char_end] = Nodes(SubString(text, char_start:char_end), local_score, subtokenid, char_start, char_end)
                     scores[char_end] = local_score
                 end
@@ -141,7 +141,7 @@ julia> WordTokenizers.decode_backward(spm, node, text)
 function decode_backward(sp::SentencePieceModel, nodes::Array{Nodes,1}, text::AbstractString)
     next_nodes = nodes[end]
     best_seq = Nodes[]
-    
+
     while next_nodes.start > 1
         node_value = next_nodes
         next_nodes = nodes[prevind(text, node_value.start)]
@@ -166,7 +166,7 @@ function tokenizer(sp::SentencePieceModel, text::AbstractString)
     tokens = reverse(tokens)
     tks = [node.text for node in tokens]
     return tks
-    
+
 end
 
 """
@@ -180,8 +180,8 @@ end
 """
     ids_from_tokens(spm::SentencePieceModel, tk::Array{String,1})
 given tokens it provide its indices
-"""     
-function ids_from_tokens(spm::SentencePieceModel, tk::Array{String,1})  
+"""
+function ids_from_tokens(spm::SentencePieceModel, tk::Array{String,1})
     map(tk) do x
         last(get(spm.vocab_map, x, spm.unk_id))
     end
@@ -195,5 +195,5 @@ function sentence_from_tokens(tk::Array{String,1})
     sen = join(tk)
     sen = replace(sen, "▁" => " ")
     sen = strip(sen)
-    return sen     
+    return sen
 end
diff --git a/test/gpt2_tokenizer.jl b/test/gpt2_tokenizer.jl
index 4d5cab7..17b02b6 100644
--- a/test/gpt2_tokenizer.jl
+++ b/test/gpt2_tokenizer.jl
@@ -1,25 +1,25 @@
 using WordTokenizers
 using Test
 
-tokenizer = load(GPT2)
+gpt2_tokenizer = load(GPT2)
 
 @testset "Pretrained" begin
-    @test typeof(tokenizer) == WordTokenizers.GPT2Tokenizer
-    @test typeof(tokenizer.vocab) == Dict{String, Any}
-    @test typeof(tokenizer.rank) == Dict{Pair{String,String}, Int}
-    @test typeof(tokenizer.cache) == Dict{String, Tuple}
+    @test typeof(gpt2_tokenizer) == WordTokenizers.GPT2Tokenizer
+    @test typeof(gpt2_tokenizer.vocab) == Dict{String, Any}
+    @test typeof(gpt2_tokenizer.rank) == Dict{Pair{String,String}, Int}
+    @test typeof(gpt2_tokenizer.cache) == Dict{String, Tuple}
     @test typeof(WordTokenizers.pretrained) == Dict{DataType,Array{String,1}}
     @test length(WordTokenizers.pretrained[GPT2]) == 2
 end
 
 @testset "Tokenizer and helper function" begin
-    @test tokenizer.vocab["Hi"] == 17250
-    @test tokenize("I love julia language", tokenizer) == ["I",
+    @test gpt2_tokenizer.vocab["Hi"] == 17250
+    @test tokenize("I love julia language", gpt2_tokenizer) == ["I",
                                                            "Ġlove",
                                                            "Ġj",
                                                            "ulia",
                                                            "Ġlanguage"]
-    tokens = tokenize("I love julia language", tokenizer)
-    @test ids_from_tokens(tokens, tokenizer) == [40, 1842, 474, 43640, 3303]
+    tokens = tokenize("I love julia language", gpt2_tokenizer)
+    @test ids_from_tokens(tokens, gpt2_tokenizer) == [40, 1842, 474, 43640, 3303]
     @test sentence_from_tokens_gpt2(tokens) == "I love julia language"
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 0b789ff..588dfad 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -8,8 +8,8 @@ files = ["simple",
          "tweet_tokenize",
          "reversible_tok",
          "toktok",
-         "gpt2_tokenizer",
-         "sp_unigram"
+         "sp_unigram",
+         "gpt2_tokenizer"
         ]
 
 @testset "$file" for file in files

From 1d90ed235668fcca73d9d213f2f9019d4f48d1fb Mon Sep 17 00:00:00 2001
From: Shikhar Goswami <shikhargoswami2308@gmail.com>
Date: Fri, 19 Mar 2021 12:53:49 +0530
Subject: [PATCH 4/7] Added more tests and did required changes

---
 .gitignore             |  1 +
 Project.toml           |  2 ++
 src/WordTokenizers.jl  |  4 +++-
 test/gpt2_tokenizer.jl | 17 +++++++++++++++++
 4 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 8c960ec..3f02ca7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 *.jl.cov
 *.jl.*.cov
 *.jl.mem
+Manifest.toml
diff --git a/Project.toml b/Project.toml
index 0e55d74..cf375d3 100644
--- a/Project.toml
+++ b/Project.toml
@@ -15,6 +15,8 @@ DataDeps = "0.6.5, 0.7"
 HTML_Entities = "1"
 StrTables = "1"
 julia = "1"
+JSON = "0.21.1"
+InternedStrings = "0.7.0"
 
 [extras]
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/src/WordTokenizers.jl b/src/WordTokenizers.jl
index bbe84a7..e833d63 100644
--- a/src/WordTokenizers.jl
+++ b/src/WordTokenizers.jl
@@ -17,7 +17,9 @@ export poormans_tokenize, punctuation_space_tokenize,
        set_tokenizer, set_sentence_splitter,
        rev_tokenize, rev_detokenize,
        toktok_tokenize
-export ALBERT_V1, ALBERT_V2, load, tokenizer, sentence_from_tokens, ids_from_tokens, GPT2, GPT2Tokenizer, tokenize, sentence_from_tokens_gpt2
+
+export ALBERT_V1, ALBERT_V2, GPT2
+export load, tokenizer, sentence_from_tokens, ids_from_tokens, tokenize, sentence_from_tokens_gpt2
 export PretrainedTokenizer, tokenizer_files
 include("words/fast.jl")
 
diff --git a/test/gpt2_tokenizer.jl b/test/gpt2_tokenizer.jl
index 17b02b6..9f0a6c6 100644
--- a/test/gpt2_tokenizer.jl
+++ b/test/gpt2_tokenizer.jl
@@ -22,4 +22,21 @@ end
     tokens = tokenize("I love julia language", gpt2_tokenizer)
     @test ids_from_tokens(tokens, gpt2_tokenizer) == [40, 1842, 474, 43640, 3303]
     @test sentence_from_tokens_gpt2(tokens) == "I love julia language"
+
+    tokens= tokenize("A census taker once tried to test me. I ate his liver with some fava beans and a nice Chianti.", gpt2_tokenizer)
+    @test tokens  == ["A", "Ġcensus", "Ġt", "aker", "Ġonce",
+                     "Ġtried", "Ġto", "Ġtest", "Ġme", ".",
+                     "ĠI", "Ġate", "Ġhis", "Ġliver", "Ġwith",
+                     "Ġsome", "Ġfav", "a","Ġbeans", "Ġand",
+                     "Ġa", "Ġnice", "ĠCh", "iant", "i", "."]
+    @test ids_from_tokens(tokens, gpt2_tokenizer) == [32, 21649, 256, 3110, 1752, 3088, 284, 1332, 502, 13, 314, 15063,
+                                      465, 14383, 351, 617, 2090, 64, 16567, 290, 257, 3621, 609, 3014,
+                                      72, 13]
+
+   text = "Badges? We ain't got no badges:) We don't need no badges:p I don't have to show you any stinking badges!"
+   tokens = tokenize(text, gpt2_tokenizer)
+   @test tokens == ["Bad", "ges", "?", "ĠWe", "Ġain", "'t", "Ġgot", "Ġno", "Ġbadges", ":", ")", "ĠWe",
+                    "Ġdon", "'t", "Ġneed", "Ġno", "Ġbadges", ":", "p", "ĠI", "Ġdon", "'t", "Ġhave",
+                    "Ġto", "Ġshow", "Ġyou", "Ġany", "Ġst", "inking", "Ġbadges", "!"]
+   @test sentence_from_tokens_gpt2(tokens) == text
 end

From 6f2f448535461c1c1844ff900a50c2514fff5bf6 Mon Sep 17 00:00:00 2001
From: Shikhar Goswami <shikhargoswami2308@gmail.com>
Date: Mon, 22 Mar 2021 13:01:17 +0530
Subject: [PATCH 5/7] Standardised API to match with existing one

---
 Project.toml                     |  2 +-
 src/WordTokenizers.jl            |  2 +-
 src/statistical/gpt2tokenizer.jl | 16 ++++++++--------
 src/statistical/unigram.jl       | 10 +++++-----
 test/gpt2_tokenizer.jl           | 16 ++++++++--------
 test/sp_unigram.jl               | 12 ++++++------
 6 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/Project.toml b/Project.toml
index cf375d3..526c3ec 100644
--- a/Project.toml
+++ b/Project.toml
@@ -14,7 +14,7 @@ Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
 DataDeps = "0.6.5, 0.7"
 HTML_Entities = "1"
 StrTables = "1"
-julia = "1"
+julia = "1, 1.1"
 JSON = "0.21.1"
 InternedStrings = "0.7.0"
 
diff --git a/src/WordTokenizers.jl b/src/WordTokenizers.jl
index e833d63..70b46aa 100644
--- a/src/WordTokenizers.jl
+++ b/src/WordTokenizers.jl
@@ -19,7 +19,7 @@ export poormans_tokenize, punctuation_space_tokenize,
        toktok_tokenize
 
 export ALBERT_V1, ALBERT_V2, GPT2
-export load, tokenizer, sentence_from_tokens, ids_from_tokens, tokenize, sentence_from_tokens_gpt2
+export load, tokenize, sentence_from_tokens, ids_from_tokens
 export PretrainedTokenizer, tokenizer_files
 include("words/fast.jl")
 
diff --git a/src/statistical/gpt2tokenizer.jl b/src/statistical/gpt2tokenizer.jl
index 8e93dd4..6732311 100644
--- a/src/statistical/gpt2tokenizer.jl
+++ b/src/statistical/gpt2tokenizer.jl
@@ -148,7 +148,7 @@ function bpe(token::String, tokenizer::GPT2Tokenizer)
 end
 
 """
-tokenize(text::String, tokenizer::GPT2Tokenizer)
+tokenize(tokenizer::GPT2Tokenizer, text::String)
 Implements tokenization of input text. This tokenizer doesn't include unknown and special tokens because
 of its byte-level BPE tokenization. GPT2 model is only trained on end token `<|endoftext|>`. Has to be
 manually added after the tokenization.
@@ -156,7 +156,7 @@ GPT2 Tokenizer treats whitespace as unicode character `\u0120 (Ġ)` before a wor
 
 # Example
 ```julia-repl
-julia> tokens = tokenize("Hi! How you doin", tokenizer)
+julia> tokens = tokenize(tokenizer, "Hi! How you doin")
 6-element Array{String,1}:
  "Hi"
  "!"
@@ -166,7 +166,7 @@ julia> tokens = tokenize("Hi! How you doin", tokenizer)
  "in"
 ```
 """
-function tokenize(text::String, tokenizer::GPT2Tokenizer)
+function tokenize(tokenizer::GPT2Tokenizer, text::String)
     mapping = bytes_to_unicode()
     tokens=Vector{String}()
     matches = map(eachmatch(tokenizer.pat, text)) do m
@@ -180,7 +180,7 @@ function tokenize(text::String, tokenizer::GPT2Tokenizer)
 end
 
 """
-ids_from_tokens(tokens::Vector{String}, tokenizer::GPT2Tokenizer)
+ids_from_tokens(tokenizer::GPT2Tokenizer, tokens::Vector{String})
 Returns respective ids of tokens from pretrained vocabulary map
 
 # Example
@@ -194,7 +194,7 @@ julia> tokens = tokenize("Hi! How you doin", tokenizer)
  "Ġdo"
  "in"
 
-julia> ids_from_tokens(tokens, tokenizer)
+julia> ids_from_tokens(tokenizer, tokens)
 6-element Array{Int64,1}:
  17250
      0
@@ -204,14 +204,14 @@ julia> ids_from_tokens(tokens, tokenizer)
    259
 ```
 """
-function ids_from_tokens(tokens::Vector{String}, tokenizer::GPT2Tokenizer)
+function ids_from_tokens(tokenizer::GPT2Tokenizer, tokens::Vector{String})
     map(tokens) do x
         last(get(tokenizer.vocab, x, 0))
     end
 end
 
-function sentence_from_tokens_gpt2(tk::Array{String,1})
-    sen = join(tk)
+function sentence_from_tokens(tokenizer::GPT2Tokenizer, tokens::Array{String,1})
+    sen = join(tokens)
     sen = replace(sen, "Ġ" => " ")
     sen = strip(sen)
     return sen
diff --git a/src/statistical/unigram.jl b/src/statistical/unigram.jl
index b3808d3..dd4df0d 100644
--- a/src/statistical/unigram.jl
+++ b/src/statistical/unigram.jl
@@ -152,11 +152,11 @@ function decode_backward(sp::SentencePieceModel, nodes::Array{Nodes,1}, text::Ab
 end
 
 """
-    tokenizer(sp::SentencePieceModel,text::AbstractString)
+    tokenize(sp::SentencePieceModel,text::AbstractString)
 It does all the preprocessing step needed and perform `decode_forward` and `decode_backward`
 ouput tokenize tokens as Array{String,1}
 """
-function tokenizer(sp::SentencePieceModel, text::AbstractString)
+function tokenize(sp::SentencePieceModel, text::AbstractString)
     text = replace(text, " " => "▁")
     if text[1] != '▁'
         text = "▁" * text
@@ -174,7 +174,7 @@ end
 It does all the preprocessing step needed and perform `decode_forward` and `decode_backward`.
 """
 function (sp::SentencePieceModel)(text::AbstractString)
-    tokenizer(sp, text)
+    tokenize(sp, text)
 end
 
 """
@@ -188,10 +188,10 @@ function ids_from_tokens(spm::SentencePieceModel, tk::Array{String,1})
 end
 
 """
-    sentence_from_tokens(tk::Array{String,1})
+    sentence_from_tokens(spm::SentencePieceModel, tk::Array{String,1})
 given tokens it provide its sentences
 """
-function sentence_from_tokens(tk::Array{String,1})
+function sentence_from_tokens(spm::SentencePieceModel, tk::Array{String,1})
     sen = join(tk)
     sen = replace(sen, "▁" => " ")
     sen = strip(sen)
diff --git a/test/gpt2_tokenizer.jl b/test/gpt2_tokenizer.jl
index 9f0a6c6..195daae 100644
--- a/test/gpt2_tokenizer.jl
+++ b/test/gpt2_tokenizer.jl
@@ -14,29 +14,29 @@ end
 
 @testset "Tokenizer and helper function" begin
     @test gpt2_tokenizer.vocab["Hi"] == 17250
-    @test tokenize("I love julia language", gpt2_tokenizer) == ["I",
+    @test tokenize(gpt2_tokenizer, "I love julia language") == ["I",
                                                            "Ġlove",
                                                            "Ġj",
                                                            "ulia",
                                                            "Ġlanguage"]
-    tokens = tokenize("I love julia language", gpt2_tokenizer)
-    @test ids_from_tokens(tokens, gpt2_tokenizer) == [40, 1842, 474, 43640, 3303]
-    @test sentence_from_tokens_gpt2(tokens) == "I love julia language"
+    tokens = tokenize(gpt2_tokenizer, "I love julia language")
+    @test ids_from_tokens(gpt2_tokenizer, tokens) == [40, 1842, 474, 43640, 3303]
+    @test sentence_from_tokens(gpt2_tokenizer, tokens) == "I love julia language"
 
-    tokens= tokenize("A census taker once tried to test me. I ate his liver with some fava beans and a nice Chianti.", gpt2_tokenizer)
+    tokens= tokenize(gpt2_tokenizer, "A census taker once tried to test me. I ate his liver with some fava beans and a nice Chianti.")
     @test tokens  == ["A", "Ġcensus", "Ġt", "aker", "Ġonce",
                      "Ġtried", "Ġto", "Ġtest", "Ġme", ".",
                      "ĠI", "Ġate", "Ġhis", "Ġliver", "Ġwith",
                      "Ġsome", "Ġfav", "a","Ġbeans", "Ġand",
                      "Ġa", "Ġnice", "ĠCh", "iant", "i", "."]
-    @test ids_from_tokens(tokens, gpt2_tokenizer) == [32, 21649, 256, 3110, 1752, 3088, 284, 1332, 502, 13, 314, 15063,
+    @test ids_from_tokens(gpt2_tokenizer, tokens) == [32, 21649, 256, 3110, 1752, 3088, 284, 1332, 502, 13, 314, 15063,
                                       465, 14383, 351, 617, 2090, 64, 16567, 290, 257, 3621, 609, 3014,
                                       72, 13]
 
    text = "Badges? We ain't got no badges:) We don't need no badges:p I don't have to show you any stinking badges!"
-   tokens = tokenize(text, gpt2_tokenizer)
+   tokens = tokenize(gpt2_tokenizer, text)
    @test tokens == ["Bad", "ges", "?", "ĠWe", "Ġain", "'t", "Ġgot", "Ġno", "Ġbadges", ":", ")", "ĠWe",
                     "Ġdon", "'t", "Ġneed", "Ġno", "Ġbadges", ":", "p", "ĠI", "Ġdon", "'t", "Ġhave",
                     "Ġto", "Ġshow", "Ġyou", "Ġany", "Ġst", "inking", "Ġbadges", "!"]
-   @test sentence_from_tokens_gpt2(tokens) == text
+   @test sentence_from_tokens(gpt2_tokenizer, tokens) == text
 end
diff --git a/test/sp_unigram.jl b/test/sp_unigram.jl
index f85686e..416ca1e 100644
--- a/test/sp_unigram.jl
+++ b/test/sp_unigram.jl
@@ -16,12 +16,12 @@ end
 end
 @testset "Tokenizers and helper function" begin
     @test spm.vocab_map["now"][2] == 1388
-    @test tokenizer(spm, "I love julia language") == ["▁",        
-                                                      "I",        
-                                                      "▁love",    
-                                                      "▁julia",   
+    @test tokenize(spm, "I love julia language") == ["▁",
+                                                      "I",
+                                                      "▁love",
+                                                      "▁julia",
                                                       "▁language"]
-    tks = tokenizer(spm, "i love julia language")
+    tks = tokenize(spm, "i love julia language")
     @test ids_from_tokens(spm, tks) == [32, 340, 5424, 817]
-    @test sentence_from_tokens(tks) == "i love julia language"
+    @test sentence_from_tokens(spm, tks) == "i love julia language"
 end

From 6d2e8a9e5dd9fe82a3aeadbc927a5178b2a4de71 Mon Sep 17 00:00:00 2001
From: Shikhar Goswami <shikhargoswami2308@gmail.com>
Date: Mon, 22 Mar 2021 13:25:36 +0530
Subject: [PATCH 6/7] Corrected and Modified README

---
 README.md | 38 +++++++++++++++++++++++---------------
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 148f666..b394a81 100644
--- a/README.md
+++ b/README.md
@@ -294,42 +294,50 @@ julia> tokenize("hi__hello")
  "__"
  "hihello"
 ```
-# Statistical Tokenizer 
+# Statistical Tokenizer
 
-**Sentencepiece Unigram Encoder** is basically  the Sentencepiece processor's re-implementation in julia. It can used vocab file generated by sentencepiece library containing both vocab and log probability.
+ - **Sentencepiece Unigram Encoder** is basically a re-implementation of the Sentencepiece processor in Julia. It can use a vocab file generated by the sentencepiece library, containing both the vocab and the log probabilities.
+ For more details about the implementation, refer to the blog post [here](https://tejasvaidhyadev.github.io/blog/Sentencepiece)
 
-For more detail about implementation refer the blog post [here](https://tejasvaidhyadev.github.io/blog/Sentencepiece)
+ - **GPT2 Tokenizer** is a subword tokenizer that uses byte-level Byte Pair Encoding (BPE) to split unknown words into known subwords present in its pretrained vocabulary.
 
 **Note** :
 
 - SentencePiece escapes the whitespace with a meta symbol "▁" (U+2581).
+- `GPT2Tokenizer` treats whitespace before a word as part of the word and escapes it with the meta symbol "Ġ" (U+0120).
 
 
-### Pretrained 
+### Pretrained
 
-Wordtokenizer provides pretrained vocab file of Albert (both version-1 and version-2) 
+WordTokenizers provides pretrained vocab files for ALBERT (both version-1 and version-2) and GPT2. You can initialize the tokenizers with the `load` function.
 
 ```julia
 julia> subtypes(PretrainedTokenizer)
 2-element Array{Any,1}:
  ALBERT_V1
  ALBERT_V2
+ GPT2
 
-julia> tokenizerfiles(ALBERT_V1)
+julia> tokenizer_files(ALBERT_V1)
 4-element Array{String,1}:
  "albert_base_v1_30k-clean.vocab"   
  "albert_large_v1_30k-clean.vocab"  
- "albert_xlarge_v1_30k-clean.vocab" 
+ "albert_xlarge_v1_30k-clean.vocab"
  "albert_xxlarge_v1_30k-clean.vocab"
+
+julia> tokenizer_files(GPT2)
+2-element Array{String,1}:
+ "GPT2/encoder.json"
+ "GPT2/vocab.bpe"
 ```
 
 `DataDeps` will handle all the downloading part for us.  You can also create an issue or PR for other pretrained models or directly load by providing path in `load` function
 
 ```julia
-julia> spm = load(Albert_Version1) #loading Default Albert-base vocab in Sentencepiece
+julia> spm = load(ALBERT_V1) #loading Default Albert-base vocab in Sentencepiece
 WordTokenizers.SentencePieceModel(Dict("▁shots"=>(-11.2373, 7281),"▁ordered"=>(-9.84973, 1906),"dev"=>(-12.0915, 14439),"▁silv"=>(-12.6564, 21065),"▁doubtful"=>(-12.7799, 22569),"▁without"=>(-8.34227, 367),"▁pol"=>(-10.7694, 4828),"chem"=>(-12.3713, 17661),"▁1947,"=>(-11.7544, 11199),"▁disrespect"=>(-13.13, 26682)…), 2)
 
-julia> tk = tokenizer(spm, "i love the julia language") #or tk = spm("i love the julia language")
+julia> tk = tokenize(spm, "i love the julia language") #or tk = spm("i love the julia language")
 4-element Array{String,1}:
  "▁i"       
  "▁love"
@@ -337,7 +345,7 @@ julia> tk = tokenizer(spm, "i love the julia language") #or tk = spm("i love the
  "▁julia"   
  "▁language"
 
-julia> subword = tokenizer(spm, "unfriendly")
+julia> subword = tokenize(spm, "unfriendly")
 2-element Array{String,1}:
  "▁un"
  "friendly"
@@ -359,8 +367,8 @@ julia> para = spm("Julia is a high-level, high-performance dynamic language for
  "▁dynamic"   
  "▁language"  
  "▁for"       
- "▁technical" 
- "▁computing" 
+ "▁technical"
+ "▁computing"
 ```
 
 Indices is usually used for deep learning models.
@@ -382,13 +390,13 @@ julia> ids_from_tokens(spm, tk)
  5424
   817
 #we can also get sentences back from tokens
-julia> sentence_from_tokens(tk)
+julia> sentence_from_tokens(spm, tk)
  "i love the julia language"
 
-julia> sentence_from_token(subword)
+julia> sentence_from_tokens(spm, subword)
  "unfriendly"
 
-julia> sentence_from_tokens(para)
+julia> sentence_from_tokens(spm, para)
  "Julia is a high-level, high-performance dynamic language for technical computing"
 ```
 

From c03685cde60c129e700ed9c1fe558a5b183154b5 Mon Sep 17 00:00:00 2001
From: Shikhar Goswami <shikhargoswami2308@gmail.com>
Date: Mon, 22 Mar 2021 13:52:05 +0530
Subject: [PATCH 7/7] Deleted Manifest.toml

---
 Manifest.toml | 167 --------------------------------------------------
 1 file changed, 167 deletions(-)
 delete mode 100644 Manifest.toml

diff --git a/Manifest.toml b/Manifest.toml
deleted file mode 100644
index ca5eecd..0000000
--- a/Manifest.toml
+++ /dev/null
@@ -1,167 +0,0 @@
-# This file is machine-generated - editing it directly is not advised
-
-[[Artifacts]]
-deps = ["Pkg"]
-git-tree-sha1 = "c30985d8821e0cd73870b17b0ed0ce6dc44cb744"
-uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
-version = "1.3.0"
-
-[[Base64]]
-uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
-
-[[BinaryProvider]]
-deps = ["Libdl", "Logging", "SHA"]
-git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058"
-uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
-version = "0.5.10"
-
-[[DataDeps]]
-deps = ["BinaryProvider", "HTTP", "Libdl", "Reexport", "SHA", "p7zip_jll"]
-git-tree-sha1 = "4f0e41ff461d42cfc62ff0de4f1cd44c6e6b3771"
-uuid = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
-version = "0.7.7"
-
-[[Dates]]
-deps = ["Printf"]
-uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
-
-[[Distributed]]
-deps = ["Random", "Serialization", "Sockets"]
-uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
-
-[[HTML_Entities]]
-deps = ["StrTables"]
-git-tree-sha1 = "aa19515d6ebe7f91a39cfc1dc6341f38fcac1282"
-uuid = "7693890a-d069-55fe-a829-b4a6d304f0ee"
-version = "1.0.0"
-
-[[HTTP]]
-deps = ["Base64", "Dates", "IniFile", "MbedTLS", "NetworkOptions", "Sockets", "URIs"]
-git-tree-sha1 = "c9f380c76d8aaa1fa7ea9cf97bddbc0d5b15adc2"
-uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3"
-version = "0.9.5"
-
-[[IniFile]]
-deps = ["Test"]
-git-tree-sha1 = "098e4d2c533924c921f9f9847274f2ad89e018b8"
-uuid = "83e8ac13-25f8-5344-8a64-a9f2b223428f"
-version = "0.5.0"
-
-[[InteractiveUtils]]
-deps = ["Markdown"]
-uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
-
-[[InternedStrings]]
-deps = ["Random", "Test"]
-git-tree-sha1 = "eb05b5625bc5d821b8075a77e4c421933e20c76b"
-uuid = "7d512f48-7fb1-5a58-b986-67e6dc259f01"
-version = "0.7.0"
-
-[[JLLWrappers]]
-git-tree-sha1 = "a431f5f2ca3f4feef3bd7a5e94b8b8d4f2f647a0"
-uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
-version = "1.2.0"
-
-[[JSON]]
-deps = ["Dates", "Mmap", "Parsers", "Unicode"]
-git-tree-sha1 = "81690084b6198a2e1da36fcfda16eeca9f9f24e4"
-uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
-version = "0.21.1"
-
-[[LibGit2]]
-deps = ["Printf"]
-uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
-
-[[Libdl]]
-uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
-
-[[Logging]]
-uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
-
-[[Markdown]]
-deps = ["Base64"]
-uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
-
-[[MbedTLS]]
-deps = ["Dates", "MbedTLS_jll", "Random", "Sockets"]
-git-tree-sha1 = "1c38e51c3d08ef2278062ebceade0e46cefc96fe"
-uuid = "739be429-bea8-5141-9913-cc70e7f3736d"
-version = "1.0.3"
-
-[[MbedTLS_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "0eef589dd1c26a3ac9d753fe1a8bcad63f956fa6"
-uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
-version = "2.16.8+1"
-
-[[Mmap]]
-uuid = "a63ad114-7e13-5084-954f-fe012c677804"
-
-[[NetworkOptions]]
-git-tree-sha1 = "ed3157f48a05543cce9b241e1f2815f7e843d96e"
-uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
-version = "1.2.0"
-
-[[Parsers]]
-deps = ["Dates"]
-git-tree-sha1 = "223a825cccef2228f3fdbf2ecc7ca93363059073"
-uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
-version = "1.0.16"
-
-[[Pkg]]
-deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
-uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
-
-[[Printf]]
-deps = ["Unicode"]
-uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
-
-[[REPL]]
-deps = ["InteractiveUtils", "Markdown", "Sockets"]
-uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
-
-[[Random]]
-deps = ["Serialization"]
-uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-
-[[Reexport]]
-git-tree-sha1 = "57d8440b0c7d98fc4f889e478e80f268d534c9d5"
-uuid = "189a3867-3050-52da-a836-e630ba90ab69"
-version = "1.0.0"
-
-[[SHA]]
-uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
-
-[[Serialization]]
-uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
-
-[[Sockets]]
-uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
-
-[[StrTables]]
-deps = ["Dates"]
-git-tree-sha1 = "5998faae8c6308acc25c25896562a1e66a3bb038"
-uuid = "9700d1a9-a7c8-5760-9816-a99fda30bb8f"
-version = "1.0.1"
-
-[[Test]]
-deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
-uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-
-[[URIs]]
-git-tree-sha1 = "7855809b88d7b16e9b029afd17880930626f54a2"
-uuid = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4"
-version = "1.2.0"
-
-[[UUIDs]]
-deps = ["Random", "SHA"]
-uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
-
-[[Unicode]]
-uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
-
-[[p7zip_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "ee65cfa19bea645698a0224bfa216f2b1c8b559f"
-uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
-version = "16.2.0+3"