1 change: 1 addition & 0 deletions .gitignore
@@ -1,3 +1,4 @@
*.jl.cov
*.jl.*.cov
*.jl.mem
Manifest.toml
8 changes: 6 additions & 2 deletions Project.toml
@@ -5,14 +5,18 @@ version = "0.5.6"
[deps]
DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
HTML_Entities = "7693890a-d069-55fe-a829-b4a6d304f0ee"
InternedStrings = "7d512f48-7fb1-5a58-b986-67e6dc259f01"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
StrTables = "9700d1a9-a7c8-5760-9816-a99fda30bb8f"
Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"

[compat]
DataDeps = "0.6.5, 0.7"
julia = "1"
HTML_Entities= "1"
HTML_Entities = "1"
StrTables = "1"
julia = "1, 1.1"
JSON = "0.21.1"
InternedStrings = "0.7.0"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
38 changes: 23 additions & 15 deletions README.md
@@ -294,50 +294,58 @@ julia> tokenize("hi__hello")
"__"
"hihello"
```
# Statistical Tokenizer

**Sentencepiece Unigram Encoder** is basically the Sentencepiece processor's re-implementation in julia. It can used vocab file generated by sentencepiece library containing both vocab and log probability.
- **Sentencepiece Unigram Encoder** is essentially a re-implementation of the SentencePiece processor in Julia. It can use the vocab file generated by the sentencepiece library, which contains both the vocab and the log probabilities.
For more detail about the implementation, refer to the blog post [here](https://tejasvaidhyadev.github.io/blog/Sentencepiece).

For more detail about implementation refer the blog post [here](https://tejasvaidhyadev.github.io/blog/Sentencepiece)
- **GPT2 Tokenizer** is a subword tokenizer that uses byte-level Byte Pair Encoding (BPE) to split unknown words into known subwords from its pretrained vocabulary.

**Note**:

- SentencePiece escapes whitespace with the meta symbol "▁" (U+2581).
- The GPT2 tokenizer treats the whitespace before a word as part of that word and escapes it with the meta symbol "Ġ" (U+0120), as in the sketch below.
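
For illustration, assuming the GPT2 tokenizer follows the same `load`/`tokenize` interface shown for ALBERT below, tokenization might look like this (the token output here is illustrative, not taken from a real run):

```julia
julia> gpt2 = load(GPT2)  # fetches encoder.json and vocab.bpe via DataDeps on first use

julia> tokenize(gpt2, "hello world")  # illustrative output
2-element Array{String,1}:
 "hello"
 "Ġworld"   # the space before "world" is kept as part of the token, escaped as "Ġ"
```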


### Pretrained

Wordtokenizer provides pretrained vocab file of Albert (both version-1 and version-2)
WordTokenizers provides pretrained vocab files for ALBERT (both version-1 and version-2) and GPT2. You can initialize a tokenizer with the `load` function.

```julia
julia> subtypes(PretrainedTokenizer)
3-element Array{Any,1}:
ALBERT_V1
ALBERT_V2
GPT2

julia> tokenizerfiles(ALBERT_V1)
julia> tokenizer_files(ALBERT_V1)
4-element Array{String,1}:
"albert_base_v1_30k-clean.vocab"
"albert_large_v1_30k-clean.vocab"
"albert_xlarge_v1_30k-clean.vocab"
"albert_xlarge_v1_30k-clean.vocab"
"albert_xxlarge_v1_30k-clean.vocab"

julia> tokenizer_files(GPT2)
2-element Array{String,1}:
"GPT2/encoder.json"
"GPT2/vocab.bpe"
```

`DataDeps` handles all the downloading for us. You can also open an issue or PR for other pretrained models, or load a vocab file directly by passing its path to the `load` function.

```julia
julia> spm = load(Albert_Version1) #loading Default Albert-base vocab in Sentencepiece
julia> spm = load(ALBERT_V1) #loading Default Albert-base vocab in Sentencepiece
WordTokenizers.SentencePieceModel(Dict("▁shots"=>(-11.2373, 7281),"▁ordered"=>(-9.84973, 1906),"dev"=>(-12.0915, 14439),"▁silv"=>(-12.6564, 21065),"▁doubtful"=>(-12.7799, 22569),"▁without"=>(-8.34227, 367),"▁pol"=>(-10.7694, 4828),"chem"=>(-12.3713, 17661),"▁1947,"=>(-11.7544, 11199),"▁disrespect"=>(-13.13, 26682)…), 2)

julia> tk = tokenizer(spm, "i love the julia language") #or tk = spm("i love the julia language")
julia> tk = tokenize(spm, "i love the julia language") #or tk = spm("i love the julia language")
5-element Array{String,1}:
"▁i"
"▁love"
"▁the"
"▁julia"
"▁language"

julia> subword = tokenizer(spm, "unfriendly")
julia> subword = tokenize(spm, "unfriendly")
2-element Array{String,1}:
"▁un"
"friendly"
@@ -359,8 +367,8 @@ julia> para = spm("Julia is a high-level, high-performance dynamic language for
"▁dynamic"
"▁language"
"▁for"
"▁technical"
"▁computing"
"▁technical"
"▁computing"
```

Indices are usually used for deep learning models.
@@ -382,13 +390,13 @@ julia> ids_from_tokens(spm, tk)
5424
817
#we can also get sentences back from tokens
julia> sentence_from_tokens(tk)
julia> sentence_from_tokens(spm, tk)
"i love the julia language"

julia> sentence_from_token(subword)
julia> sentence_from_tokens(spm, subword)
"unfriendly"

julia> sentence_from_tokens(para)
julia> sentence_from_tokens(spm, para)
"Julia is a high-level, high-performance dynamic language for technical computing"
```

Expand Down
14 changes: 12 additions & 2 deletions src/WordTokenizers.jl
@@ -4,7 +4,7 @@ module WordTokenizers
using HTML_Entities
using StrTables
using Unicode
using DataDeps
using DataDeps, JSON, InternedStrings

abstract type PretrainedTokenizer end

@@ -17,7 +17,9 @@ export poormans_tokenize, punctuation_space_tokenize,
set_tokenizer, set_sentence_splitter,
rev_tokenize, rev_detokenize,
toktok_tokenize
export ALBERT_V1, ALBERT_V2, load, tokenizer, sentence_from_tokens, ids_from_tokens

export ALBERT_V1, ALBERT_V2, GPT2
export load, tokenize, sentence_from_tokens, ids_from_tokens
export PretrainedTokenizer, tokenizer_files
include("words/fast.jl")

@@ -33,6 +35,7 @@ include("set_method_api.jl")
include("split_api.jl")

include("statistical/unigram.jl")
include("statistical/gpt2tokenizer.jl")

const pretrained = Dict{DataType, Vector{String}}()
function tokenizer_files(::Type{T}) where T<:PretrainedTokenizer
@@ -47,4 +50,11 @@ function __init__()
init_vocab_datadeps()
end

load(::Val{:ALBERT_V1}) = load_sp(ALBERT_V1)
load(::Val{:ALBERT_V2}) = load_sp(ALBERT_V2)
load(::Val{:GPT2}) = load_gpt2(GPT2)

load(::Type{T}) where T<:PretrainedTokenizer = load(Val(Symbol(T)))


end # module
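
The `load(::Type{T})` method above dispatches on `Val(Symbol(T))`, so each pretrained tokenizer only needs a matching `load(::Val{...})` method. A minimal sketch of how a hypothetical additional tokenizer could hook in (the names `MyTok` and `load_mytok` are illustrative, not part of this diff):

```julia
# Hypothetical example of extending the Val-based `load` dispatch.
abstract type MyTok <: PretrainedTokenizer end

load_mytok(::Type{MyTok}) = "load vocab/merges files here"  # placeholder loader
load(::Val{:MyTok}) = load_mytok(MyTok)

# `load(MyTok)` now routes through load(::Type{T}) -> load(Val(Symbol(T))).
```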
17 changes: 16 additions & 1 deletion src/statistical/Vocab_DataDeps.jl
@@ -1,5 +1,6 @@
abstract type ALBERT_V1 <: PretrainedTokenizer end
abstract type ALBERT_V2 <: PretrainedTokenizer end
abstract type GPT2 <: PretrainedTokenizer end

const vectors_albertversion1 = [
("albert_base_v1_30k-clean.vocab",
@@ -40,6 +41,8 @@ const vectors_albertversion2 = [
"https://raw.githubusercontent.com/tejasvaidhyadev/ALBERT.jl/master/src/Vocabs/albert_xxlarge_v2_30k-clean.vocab")
]

const vectors_gpt2 = ["encoder.json", "vocab.bpe"]

function init_vocab_datadeps()
for (depname, description, sha, link) in vectors_albertversion1
register(DataDep(depname,
@@ -70,5 +73,17 @@ function init_vocab_datadeps()
))
append!(tokenizer_files(ALBERT_V2), ["$depname"])
end
end

register(DataDep("GPT2",
"""
Pretrained gpt2 vocabulary and merges file by Open AI.
Website: https://openai.com/blog/better-language-models/
Author: Radford et al.
Licence: MIT
All GPT2 models are trained on the same size vocabulary.
""",
["https://openaipublic.blob.core.windows.net/gpt-2/models/117M/$(file)" for file in vectors_gpt2],
"05805f21f823300551adf0646abe905eb036fb272f97c279f0d9c656c845ca46"))

append!(tokenizer_files(GPT2), ["GPT2/$(file)" for file in vectors_gpt2])
end
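
For context, the files registered above would typically be resolved through DataDeps' `datadep"..."` string macro when the tokenizer is loaded. A rough sketch under that assumption (the actual `load_gpt2` in src/statistical/gpt2tokenizer.jl is not shown in this diff):

```julia
using DataDeps, JSON

# Sketch only: resolve the GPT2 DataDep registered above. The first use
# triggers the download; later uses read from the local DataDeps store.
encoder_path = datadep"GPT2/encoder.json"
bpe_path     = datadep"GPT2/vocab.bpe"

encoder    = JSON.parsefile(encoder_path)   # token string => integer id
bpe_merges = readlines(bpe_path)[2:end]     # first line is a version header
```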