From 86466405e0428df338edb885a87cc0a24634da06 Mon Sep 17 00:00:00 2001 From: Shikhar Goswami Date: Wed, 17 Mar 2021 20:26:47 +0530 Subject: [PATCH 1/7] Adding GPT2 tokenizer to WordEmbeddings --- Manifest.toml | 161 +++++++++++++++++++++++ Project.toml | 5 +- src/WordTokenizers.jl | 5 +- src/statistical/Vocab_DataDeps.jl | 17 ++- src/statistical/gpt2tokenizer.jl | 211 ++++++++++++++++++++++++++++++ 5 files changed, 394 insertions(+), 5 deletions(-) create mode 100644 Manifest.toml create mode 100644 src/statistical/gpt2tokenizer.jl diff --git a/Manifest.toml b/Manifest.toml new file mode 100644 index 0000000..7c27a9c --- /dev/null +++ b/Manifest.toml @@ -0,0 +1,161 @@ +# This file is machine-generated - editing it directly is not advised + +[[Artifacts]] +deps = ["Pkg"] +git-tree-sha1 = "c30985d8821e0cd73870b17b0ed0ce6dc44cb744" +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" +version = "1.3.0" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[BinaryProvider]] +deps = ["Libdl", "Logging", "SHA"] +git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058" +uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" +version = "0.5.10" + +[[DataDeps]] +deps = ["BinaryProvider", "HTTP", "Libdl", "Reexport", "SHA", "p7zip_jll"] +git-tree-sha1 = "4f0e41ff461d42cfc62ff0de4f1cd44c6e6b3771" +uuid = "124859b0-ceae-595e-8997-d05f6a7a8dfe" +version = "0.7.7" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[HTML_Entities]] +deps = ["StrTables"] +git-tree-sha1 = "aa19515d6ebe7f91a39cfc1dc6341f38fcac1282" +uuid = "7693890a-d069-55fe-a829-b4a6d304f0ee" +version = "1.0.0" + +[[HTTP]] +deps = ["Base64", "Dates", "IniFile", "MbedTLS", "NetworkOptions", "Sockets", "URIs"] +git-tree-sha1 = "c9f380c76d8aaa1fa7ea9cf97bddbc0d5b15adc2" +uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3" +version = "0.9.5" + +[[IniFile]] +deps = ["Test"] 
+git-tree-sha1 = "098e4d2c533924c921f9f9847274f2ad89e018b8" +uuid = "83e8ac13-25f8-5344-8a64-a9f2b223428f" +version = "0.5.0" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[JLLWrappers]] +git-tree-sha1 = "a431f5f2ca3f4feef3bd7a5e94b8b8d4f2f647a0" +uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" +version = "1.2.0" + +[[JSON]] +deps = ["Dates", "Mmap", "Parsers", "Unicode"] +git-tree-sha1 = "81690084b6198a2e1da36fcfda16eeca9f9f24e4" +uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +version = "0.21.1" + +[[LibGit2]] +deps = ["Printf"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[MbedTLS]] +deps = ["Dates", "MbedTLS_jll", "Random", "Sockets"] +git-tree-sha1 = "1c38e51c3d08ef2278062ebceade0e46cefc96fe" +uuid = "739be429-bea8-5141-9913-cc70e7f3736d" +version = "1.0.3" + +[[MbedTLS_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "0eef589dd1c26a3ac9d753fe1a8bcad63f956fa6" +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" +version = "2.16.8+1" + +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[NetworkOptions]] +git-tree-sha1 = "ed3157f48a05543cce9b241e1f2815f7e843d96e" +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" +version = "1.2.0" + +[[Parsers]] +deps = ["Dates"] +git-tree-sha1 = "223a825cccef2228f3fdbf2ecc7ca93363059073" +uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" +version = "1.0.16" + +[[Pkg]] +deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[Random]] +deps = ["Serialization"] +uuid 
= "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[Reexport]] +git-tree-sha1 = "57d8440b0c7d98fc4f889e478e80f268d534c9d5" +uuid = "189a3867-3050-52da-a836-e630ba90ab69" +version = "1.0.0" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[StrTables]] +deps = ["Dates"] +git-tree-sha1 = "5998faae8c6308acc25c25896562a1e66a3bb038" +uuid = "9700d1a9-a7c8-5760-9816-a99fda30bb8f" +version = "1.0.1" + +[[Test]] +deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[URIs]] +git-tree-sha1 = "7855809b88d7b16e9b029afd17880930626f54a2" +uuid = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" +version = "1.2.0" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[p7zip_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "ee65cfa19bea645698a0224bfa216f2b1c8b559f" +uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" +version = "16.2.0+3" diff --git a/Project.toml b/Project.toml index a81d92f..5b03149 100644 --- a/Project.toml +++ b/Project.toml @@ -5,14 +5,15 @@ version = "0.5.6" [deps] DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe" HTML_Entities = "7693890a-d069-55fe-a829-b4a6d304f0ee" +JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" StrTables = "9700d1a9-a7c8-5760-9816-a99fda30bb8f" Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" [compat] DataDeps = "0.6.5, 0.7" -julia = "1" -HTML_Entities= "1" +HTML_Entities = "1" StrTables = "1" +julia = "1" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/src/WordTokenizers.jl b/src/WordTokenizers.jl index e25eb73..248fba5 100644 --- a/src/WordTokenizers.jl +++ b/src/WordTokenizers.jl @@ -4,7 +4,7 @@ module WordTokenizers using HTML_Entities using StrTables using Unicode -using DataDeps +using DataDeps, JSON, 
InternedStrings abstract type PretrainedTokenizer end @@ -17,7 +17,7 @@ export poormans_tokenize, punctuation_space_tokenize, set_tokenizer, set_sentence_splitter, rev_tokenize, rev_detokenize, toktok_tokenize -export ALBERT_V1, ALBERT_V2, load, tokenizer, sentence_from_tokens, ids_from_tokens +export ALBERT_V1, ALBERT_V2, load, tokenizer, sentence_from_tokens, ids_from_tokens, GPT2, GPT2Tokenizer, tokenize export PretrainedTokenizer, tokenizer_files include("words/fast.jl") @@ -33,6 +33,7 @@ include("set_method_api.jl") include("split_api.jl") include("statistical/unigram.jl") +include("statistical/gpt2tokenizer.jl") const pretrained = Dict{DataType, Vector{String}}() function tokenizer_files(::Type{T}) where T<:PretrainedTokenizer diff --git a/src/statistical/Vocab_DataDeps.jl b/src/statistical/Vocab_DataDeps.jl index d935ba7..ad09dd2 100644 --- a/src/statistical/Vocab_DataDeps.jl +++ b/src/statistical/Vocab_DataDeps.jl @@ -1,5 +1,6 @@ abstract type ALBERT_V1 <: PretrainedTokenizer end abstract type ALBERT_V2 <: PretrainedTokenizer end +abstract type GPT2 <: PretrainedTokenizer end const vectors_albertversion1 = [ ("albert_base_v1_30k-clean.vocab", @@ -40,6 +41,8 @@ const vectors_albertversion2 = [ "https://raw.githubusercontent.com/tejasvaidhyadev/ALBERT.jl/master/src/Vocabs/albert_xxlarge_v2_30k-clean.vocab") ] +const vectors_gpt2 = ["encoder.json", "vocab.bpe"] + function init_vocab_datadeps() for (depname, description, sha, link) in vectors_albertversion1 register(DataDep(depname, @@ -70,5 +73,17 @@ function init_vocab_datadeps() )) append!(tokenizer_files(ALBERT_V2), ["$depname"]) end -end + register(DataDep("GPT2", + """ + Pretrained gpt2 vocabulary and merges file by Open AI. + Website: https://openai.com/blog/better-language-models/ + Author: Radford et al + Licence: MIT + All GPT2 Models are trained on same size vocabulary. 
+ """, + ["https://openaipublic.blob.core.windows.net/gpt-2/models/117M/$(file)" for file in vectors_gpt2], + "05805f21f823300551adf0646abe905eb036fb272f97c279f0d9c656c845ca46")) + + append!(tokenizer_files(GPT2), ["GPT2/$(file)" for file in vectors_gpt2]) +end diff --git a/src/statistical/gpt2tokenizer.jl b/src/statistical/gpt2tokenizer.jl new file mode 100644 index 0000000..52270f6 --- /dev/null +++ b/src/statistical/gpt2tokenizer.jl @@ -0,0 +1,211 @@ +""" +struct GPT2Tokenizer + vocab::Dict{String, Any} + rank::Dict{Pair{String,String}, Int} + cache::Dict{String, Tuple} + pat::Regex +end +structure, To hold pretrained vocabulary map and merge rules for GPT2 +""" +struct GPT2Tokenizer + vocab::Dict{String, Any} + rank::Dict{Pair{String,String}, Int} + cache::Dict{String, Tuple} + pat::Regex + + function GPT2Tokenizer(::Type{T};pat=r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+") where T<:PretrainedTokenizer + + vocab_file = @datadep_str tokenizer_files(T)[1] + bfile = @datadep_str tokenizer_files(T)[2] + + vocab = Dict{String, Any}() + rank = Dict{Pair{String, String}, Int}() + cache = Dict{String, Tuple}() + + vocab = JSON.parsefile(vocab_file) + + open(bfile) do f + for (i, line) ∈ enumerate(eachline(f)) + if i==1 + identity + else + pair = Pair(split(line," ")...) 
+ rank[pair] = i-1 + end + end + end + new(vocab, rank, cache, pat) + end +end + +""" +load(ty::Type{T}) where T<:PretrainedTokenizer +Initializes the GPT2Tokenizer and loads the vocab and merges files from `DataDeps` +#Example +```julia-repl +julia> tokenizer = load(GPT2) +GPT2Tokenizer(Dict{String,Any}("ilet" => 41550,"ĠVer" => 4643,"599" => 43452,"ĠRubin" => 34599,"Ġwrestler" => 34845,"Ġsharp" => 7786,"ĠObst" => 46378,"Ġlover" => 18854,"Core" => 14055,"Ġro" => 686…), Dict(("Ġne" => "ver") => 984,("ĠP" => "helps") => 40332,("Ġrapid" => "ly") => 8647,("s" => "af") => 49330,("Ġsn" => "ack") => 26651,("ra" => "ft") => 1362,("ĠCloud" => "s") => 46043,("Ġbrill" => "iant") => 10202,("Ġconsequ" => "ence") => 12666,("Ġplug" => "in") => 13622…), Dict{String,Tuple}(), r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+") +``` +""" +function load(ty::Type{T}) where T<:PretrainedTokenizer + GPT2Tokenizer(T) +end + +""" +Returns Dictionary of utf-8 encoding and corresponding unicode strings for Byte-Pair Encoding. +""" +function bytes_to_unicode() + bs = [33:255...] + cs = bs[:] + n=0 + for b in 0:255 + if b ∉ bs + append!(bs, b) + append!(cs, 256+n) + n+=1 + end + end + cs = [Char(n) for n in cs] + Dict(zip(bs,cs)) +end + +toStrTuple(x::Vector{String})=toStrTuple(join(x)) +function toStrTuple(x::AbstractString) + fs = intern.(split(chop(x), "")) + push!(fs, intern(x[end]*"")) + filter!((x)-> x != "", fs) + Tuple(fs) +end + +""" +get_pairs(word::NTuple{}) +Returns set of pairs in a word. Word is a tuple of strings. 
+""" +function get_pairs(word::NTuple{}) + pairs = Set{Pair{}}() + prev_char = word[1] + for char in word[2:end] + push!(pairs, Pair(prev_char, char)) + prev_char = char + end + pairs +end + +lowestpair(pairs::Set{Pair{}},tokenizer::GPT2Tokenizer) = lowestpair(collect(pairs), tokenizer::GPT2Tokenizer) +lowestpair(pairs::Vector{Pair{}}, tokenizer::GPT2Tokenizer) = argmin( + sizehint!(Dict( + map(pairs) do p + p=>get(tokenizer.rank, p, typemax(Int)) + end), + length(pairs)) + ) + + +function bpe(token::String, tokenizer::GPT2Tokenizer) + + haskey(tokenizer.cache, token) && return tokenizer.cache[token] + word = toStrTuple(token) + pairs = get_pairs(word) + isempty(pairs) && return token + + while true + pair = lowestpair(pairs, tokenizer) + !haskey(tokenizer.rank, pair) && break + first, second = pair + new_word=Vector{String}() + i=1 + + while i <= length(word) + + try + j = findnext(isequal(first), word, i) + append!(new_word, word[i:j-1]) + i=j + catch + append!(new_word,word[i:end]) + break + end + + if word[i]==first && i<=length(word)-1 && word[i+1]==second + push!(new_word, first*second) + i+=2 + else + push!(new_word, word[i]) + i+=1 + end + end + new_word = Tuple(new_word) + word = new_word + + if length(word)==1 + break + else + pairs = get_pairs(word) + end + end + tokenizer.cache[token] = word + word +end + +""" +tokenize(text::String, tokenizer::GPT2Tokenizer) +Implements tokenization of input text. This tokenizer doesn't include unknown and special tokens because +of its byte-level BPE tokenization. GPT2 model is only trained on end token `<|endoftext|>`. Has to be +manually added after the tokenization. +GPT2 Tokenizer treats whitespace as unicode character `\u0120 (Ġ)` before a word. + +# Example +```julia-repl +julia> tokens = tokenize("Hi! How you doin", tokenizer) +6-element Array{String,1}: + "Hi" + "!" 
+ "ĠHow" + "Ġyou" + "Ġdo" + "in" +``` +""" +function tokenize(text::String, tokenizer::GPT2Tokenizer) + mapping = bytes_to_unicode() + tokens=Vector{String}() + matches = map(eachmatch(tokenizer.pat, text)) do m + m.match + end + for token in matches + token = join([mapping[Int(b)] for b in token]) + append!(tokens, [string(bpe_token) for bpe_token in bpe(token, tokenizer)]) + end + tokens +end + +""" +ids_from_tokens(tokens::Vector{String}, tokenizer::GPT2Tokenizer) +Returns respective ids of tokens from pretrained vocabulary map + +# Example +```julia-repl +julia> tokens = tokenize("Hi! How you doin", tokenizer) +6-element Array{String,1}: + "Hi" + "!" + "ĠHow" + "Ġyou" + "Ġdo" + "in" + +julia> ids_from_tokens(tokens, tokenizer) +6-element Array{Int64,1}: + 17250 + 0 + 1374 + 345 + 466 + 259 +``` +""" +function ids_from_tokens(tokens::Vector{String}, tokenizer::GPT2Tokenizer) + map(tokens) do x + last(get(tokenizer.vocab, x, 0)) + end +end From 179c517ca15756e0a565511274f5f85244f60fa8 Mon Sep 17 00:00:00 2001 From: Shikhar Goswami Date: Wed, 17 Mar 2021 21:12:11 +0530 Subject: [PATCH 2/7] Added tests --- Manifest.toml | 6 ++++++ Project.toml | 1 + src/WordTokenizers.jl | 2 +- src/statistical/gpt2tokenizer.jl | 9 ++++++++- test/gpt2_tokenizer.jl | 25 +++++++++++++++++++++++++ test/runtests.jl | 1 + 6 files changed, 42 insertions(+), 2 deletions(-) create mode 100644 test/gpt2_tokenizer.jl diff --git a/Manifest.toml b/Manifest.toml index 7c27a9c..ca5eecd 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -51,6 +51,12 @@ version = "0.5.0" deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" +[[InternedStrings]] +deps = ["Random", "Test"] +git-tree-sha1 = "eb05b5625bc5d821b8075a77e4c421933e20c76b" +uuid = "7d512f48-7fb1-5a58-b986-67e6dc259f01" +version = "0.7.0" + [[JLLWrappers]] git-tree-sha1 = "a431f5f2ca3f4feef3bd7a5e94b8b8d4f2f647a0" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" diff --git a/Project.toml b/Project.toml index 5b03149..0e55d74 100644 
--- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "0.5.6" [deps] DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe" HTML_Entities = "7693890a-d069-55fe-a829-b4a6d304f0ee" +InternedStrings = "7d512f48-7fb1-5a58-b986-67e6dc259f01" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" StrTables = "9700d1a9-a7c8-5760-9816-a99fda30bb8f" Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" diff --git a/src/WordTokenizers.jl b/src/WordTokenizers.jl index 248fba5..fb54bb3 100644 --- a/src/WordTokenizers.jl +++ b/src/WordTokenizers.jl @@ -17,7 +17,7 @@ export poormans_tokenize, punctuation_space_tokenize, set_tokenizer, set_sentence_splitter, rev_tokenize, rev_detokenize, toktok_tokenize -export ALBERT_V1, ALBERT_V2, load, tokenizer, sentence_from_tokens, ids_from_tokens, GPT2, GPT2Tokenizer, tokenize +export ALBERT_V1, ALBERT_V2, load, tokenizer, sentence_from_tokens, ids_from_tokens, GPT2, GPT2Tokenizer, tokenize, sentence_from_tokens_gpt2 export PretrainedTokenizer, tokenizer_files include("words/fast.jl") diff --git a/src/statistical/gpt2tokenizer.jl b/src/statistical/gpt2tokenizer.jl index 52270f6..5510c3c 100644 --- a/src/statistical/gpt2tokenizer.jl +++ b/src/statistical/gpt2tokenizer.jl @@ -44,7 +44,7 @@ Initializes the GPT2Tokenizer and loads the vocab and merges files from `DataDep #Example ```julia-repl julia> tokenizer = load(GPT2) -GPT2Tokenizer(Dict{String,Any}("ilet" => 41550,"ĠVer" => 4643,"599" => 43452,"ĠRubin" => 34599,"Ġwrestler" => 34845,"Ġsharp" => 7786,"ĠObst" => 46378,"Ġlover" => 18854,"Core" => 14055,"Ġro" => 686…), Dict(("Ġne" => "ver") => 984,("ĠP" => "helps") => 40332,("Ġrapid" => "ly") => 8647,("s" => "af") => 49330,("Ġsn" => "ack") => 26651,("ra" => "ft") => 1362,("ĠCloud" => "s") => 46043,("Ġbrill" => "iant") => 10202,("Ġconsequ" => "ence") => 12666,("Ġplug" => "in") => 13622…), Dict{String,Tuple}(), r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+") + ``` """ function load(ty::Type{T}) where T<:PretrainedTokenizer 
@@ -209,3 +209,10 @@ function ids_from_tokens(tokens::Vector{String}, tokenizer::GPT2Tokenizer) last(get(tokenizer.vocab, x, 0)) end end + +function sentence_from_tokens_gpt2(tk::Array{String,1}) + sen = join(tk) + sen = replace(sen, "Ġ" => " ") + sen = strip(sen) + return sen +end diff --git a/test/gpt2_tokenizer.jl b/test/gpt2_tokenizer.jl new file mode 100644 index 0000000..4d5cab7 --- /dev/null +++ b/test/gpt2_tokenizer.jl @@ -0,0 +1,25 @@ +using WordTokenizers +using Test + +tokenizer = load(GPT2) + +@testset "Pretrained" begin + @test typeof(tokenizer) == WordTokenizers.GPT2Tokenizer + @test typeof(tokenizer.vocab) == Dict{String, Any} + @test typeof(tokenizer.rank) == Dict{Pair{String,String}, Int} + @test typeof(tokenizer.cache) == Dict{String, Tuple} + @test typeof(WordTokenizers.pretrained) == Dict{DataType,Array{String,1}} + @test length(WordTokenizers.pretrained[GPT2]) == 2 +end + +@testset "Tokenizer and helper function" begin + @test tokenizer.vocab["Hi"] == 17250 + @test tokenize("I love julia language", tokenizer) == ["I", + "Ġlove", + "Ġj", + "ulia", + "Ġlanguage"] + tokens = tokenize("I love julia language", tokenizer) + @test ids_from_tokens(tokens, tokenizer) == [40, 1842, 474, 43640, 3303] + @test sentence_from_tokens_gpt2(tokens) == "I love julia language" +end diff --git a/test/runtests.jl b/test/runtests.jl index 10bb818..0b789ff 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -8,6 +8,7 @@ files = ["simple", "tweet_tokenize", "reversible_tok", "toktok", + "gpt2_tokenizer", "sp_unigram" ] From de91b91ec00c38ab7c89d57f25742c1a30e71ffd Mon Sep 17 00:00:00 2001 From: Shikhar Goswami Date: Thu, 18 Mar 2021 13:10:54 +0530 Subject: [PATCH 3/7] Fixedloading issue --- src/WordTokenizers.jl | 7 ++++++ src/statistical/gpt2tokenizer.jl | 4 ++-- src/statistical/unigram.jl | 38 ++++++++++++++++---------------- test/gpt2_tokenizer.jl | 18 +++++++-------- test/runtests.jl | 4 ++-- 5 files changed, 39 insertions(+), 32 deletions(-) diff --git 
a/src/WordTokenizers.jl b/src/WordTokenizers.jl index fb54bb3..bbe84a7 100644 --- a/src/WordTokenizers.jl +++ b/src/WordTokenizers.jl @@ -48,4 +48,11 @@ function __init__() init_vocab_datadeps() end +load(::Val{:ALBERT_V1}) = load_sp(ALBERT_V1) +load(::Val{:ALBERT_V2}) = load_sp(ALBERT_V2) +load(::Val{:GPT2}) = load_gpt2(GPT2) + +load(::Type{T}) where T<:PretrainedTokenizer = load(Val(Symbol(T))) + + end # module diff --git a/src/statistical/gpt2tokenizer.jl b/src/statistical/gpt2tokenizer.jl index 5510c3c..8e93dd4 100644 --- a/src/statistical/gpt2tokenizer.jl +++ b/src/statistical/gpt2tokenizer.jl @@ -39,7 +39,7 @@ struct GPT2Tokenizer end """ -load(ty::Type{T}) where T<:PretrainedTokenizer +load_gpt2(ty::Type{T}) where T<:PretrainedTokenizer Initializes the GPT2Tokenizer and loads the vocab and merges files from `DataDeps` #Example ```julia-repl @@ -47,7 +47,7 @@ julia> tokenizer = load(GPT2) ``` """ -function load(ty::Type{T}) where T<:PretrainedTokenizer +function load_gpt2(::Type{T}) where T<:PretrainedTokenizer GPT2Tokenizer(T) end diff --git a/src/statistical/unigram.jl b/src/statistical/unigram.jl index 2901b1d..b3808d3 100644 --- a/src/statistical/unigram.jl +++ b/src/statistical/unigram.jl @@ -3,7 +3,7 @@ struct SentencePieceModel vocab_map::Dict{String, Tuple{Float64, Int}} unk_id::Int end -structure, To hold unknown token index and map of vocabulary to log probability and index +structure, To hold unknown token index and map of vocabulary to log probability and index """ struct SentencePieceModel vocab_map::Dict{String, Tuple{Float64, Int}} @@ -11,25 +11,25 @@ struct SentencePieceModel end """ - load(ty::Type{T}, filenum::Int=1; unk_token="") where T<:PretrainedTokenizer + load_sp(ty::Type{T}, filenum::Int=1; unk_token="") where T<:PretrainedTokenizer use to initialize the `SentencePieceModel` by loading the file from `DataDeps` # Example ```julia-repl julia> spm = load(ALBERT_V1) ``` """ -function load(ty::Type{T}, filenum::Int=1; unk_token="") where 
T<:PretrainedTokenizer +function load_sp(ty::Type{T}, filenum::Int=1; unk_token="") where T<:PretrainedTokenizer filepath = @datadep_str tokenizer_files(ty)[filenum] name = tokenizer_files(ty)[filenum] filepath = "$filepath/$name" - load(filepath, unk_token=unk_token) + load_sp(filepath, unk_token=unk_token) end """ - load(path; unk_token="") + load_sp(path; unk_token="") use to initialize the SentencePieceModel by providing `vocab filepath` -""" -function load(path; unk_token="") +""" +function load_sp(path; unk_token="") vocab_path = readlines(path) vocabnlogp = split.(vocab_path, "\t") vocab_map = Dict(tok=>(parse(Float64, logp), index) for (index, (tok, logp)) in enumerate(vocabnlogp)) @@ -37,13 +37,13 @@ function load(path; unk_token="") unk_id = vocab_map[unk_token][2] else throw(DomainError(unk_token, "Unknown token is not in the vocabulary")) - end + end spm = SentencePieceModel(vocab_map, unk_id) return spm end """ -struct Nodes +struct Nodes text::String score::Float32 index::Int64 @@ -51,9 +51,9 @@ struct Nodes en::Int end Utility structure, To hold the results of the `forward pass` (the forward Viterbi lattice) -hold the token token string, score, vocabulary index, start and end character position +hold the token token string, score, vocabulary index, start and end character position """ -struct Nodes +struct Nodes text::String score::Float32 index::Int64 @@ -90,10 +90,10 @@ julia> node = WordTokenizers.decode_forward(spm, "I love julia language") WordTokenizers.Nodes("gua", -23.776f0, 15259, 17, 19) WordTokenizers.Nodes("ag", -34.1531f0, 3303, 19, 20) WordTokenizers.Nodes("language", -11.1965f0, 7021, 14, 21) -``` +``` """ function decode_forward(sp::SentencePieceModel, text::String) - results = Array{Nodes, 1}(undef, lastindex(text)) + results = Array{Nodes, 1}(undef, lastindex(text)) scores = fill(-Inf, lastindex(text)) scores[1] = 0 for char_end in eachindex(text) @@ -103,7 +103,7 @@ function decode_forward(sp::SentencePieceModel, text::String) if 
haskey(sp.vocab_map, subtoken) subtokenid = sp.vocab_map[subtoken][2] local_score = scores[char_start] + sp.vocab_map[subtoken][1] - if local_score > scores[char_end] + if local_score > scores[char_end] results[char_end] = Nodes(SubString(text, char_start:char_end), local_score, subtokenid, char_start, char_end) scores[char_end] = local_score end @@ -141,7 +141,7 @@ julia> WordTokenizers.decode_backward(spm, node, text) function decode_backward(sp::SentencePieceModel, nodes::Array{Nodes,1}, text::AbstractString) next_nodes = nodes[end] best_seq = Nodes[] - + while next_nodes.start > 1 node_value = next_nodes next_nodes = nodes[prevind(text, node_value.start)] @@ -166,7 +166,7 @@ function tokenizer(sp::SentencePieceModel, text::AbstractString) tokens = reverse(tokens) tks = [node.text for node in tokens] return tks - + end """ @@ -180,8 +180,8 @@ end """ ids_from_tokens(spm::SentencePieceModel, tk::Array{String,1}) given tokens it provide its indices -""" -function ids_from_tokens(spm::SentencePieceModel, tk::Array{String,1}) +""" +function ids_from_tokens(spm::SentencePieceModel, tk::Array{String,1}) map(tk) do x last(get(spm.vocab_map, x, spm.unk_id)) end @@ -195,5 +195,5 @@ function sentence_from_tokens(tk::Array{String,1}) sen = join(tk) sen = replace(sen, "▁" => " ") sen = strip(sen) - return sen + return sen end diff --git a/test/gpt2_tokenizer.jl b/test/gpt2_tokenizer.jl index 4d5cab7..17b02b6 100644 --- a/test/gpt2_tokenizer.jl +++ b/test/gpt2_tokenizer.jl @@ -1,25 +1,25 @@ using WordTokenizers using Test -tokenizer = load(GPT2) +gpt2_tokenizer = load(GPT2) @testset "Pretrained" begin - @test typeof(tokenizer) == WordTokenizers.GPT2Tokenizer - @test typeof(tokenizer.vocab) == Dict{String, Any} - @test typeof(tokenizer.rank) == Dict{Pair{String,String}, Int} - @test typeof(tokenizer.cache) == Dict{String, Tuple} + @test typeof(gpt2_tokenizer) == WordTokenizers.GPT2Tokenizer + @test typeof(gpt2_tokenizer.vocab) == Dict{String, Any} + @test 
typeof(gpt2_tokenizer.rank) == Dict{Pair{String,String}, Int} + @test typeof(gpt2_tokenizer.cache) == Dict{String, Tuple} @test typeof(WordTokenizers.pretrained) == Dict{DataType,Array{String,1}} @test length(WordTokenizers.pretrained[GPT2]) == 2 end @testset "Tokenizer and helper function" begin - @test tokenizer.vocab["Hi"] == 17250 - @test tokenize("I love julia language", tokenizer) == ["I", + @test gpt2_tokenizer.vocab["Hi"] == 17250 + @test tokenize("I love julia language", gpt2_tokenizer) == ["I", "Ġlove", "Ġj", "ulia", "Ġlanguage"] - tokens = tokenize("I love julia language", tokenizer) - @test ids_from_tokens(tokens, tokenizer) == [40, 1842, 474, 43640, 3303] + tokens = tokenize("I love julia language", gpt2_tokenizer) + @test ids_from_tokens(tokens, gpt2_tokenizer) == [40, 1842, 474, 43640, 3303] @test sentence_from_tokens_gpt2(tokens) == "I love julia language" end diff --git a/test/runtests.jl b/test/runtests.jl index 0b789ff..588dfad 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -8,8 +8,8 @@ files = ["simple", "tweet_tokenize", "reversible_tok", "toktok", - "gpt2_tokenizer", - "sp_unigram" + "sp_unigram", + "gpt2_tokenizer" ] @testset "$file" for file in files From 1d90ed235668fcca73d9d213f2f9019d4f48d1fb Mon Sep 17 00:00:00 2001 From: Shikhar Goswami Date: Fri, 19 Mar 2021 12:53:49 +0530 Subject: [PATCH 4/7] Added more tests and did required changes --- .gitignore | 1 + Project.toml | 2 ++ src/WordTokenizers.jl | 4 +++- test/gpt2_tokenizer.jl | 17 +++++++++++++++++ 4 files changed, 23 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 8c960ec..3f02ca7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ *.jl.cov *.jl.*.cov *.jl.mem +Manifest.toml diff --git a/Project.toml b/Project.toml index 0e55d74..cf375d3 100644 --- a/Project.toml +++ b/Project.toml @@ -15,6 +15,8 @@ DataDeps = "0.6.5, 0.7" HTML_Entities = "1" StrTables = "1" julia = "1" +JSON = "0.21.1" +InternedStrings = "0.7.0" [extras] Test = 
"8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/src/WordTokenizers.jl b/src/WordTokenizers.jl index bbe84a7..e833d63 100644 --- a/src/WordTokenizers.jl +++ b/src/WordTokenizers.jl @@ -17,7 +17,9 @@ export poormans_tokenize, punctuation_space_tokenize, set_tokenizer, set_sentence_splitter, rev_tokenize, rev_detokenize, toktok_tokenize -export ALBERT_V1, ALBERT_V2, load, tokenizer, sentence_from_tokens, ids_from_tokens, GPT2, GPT2Tokenizer, tokenize, sentence_from_tokens_gpt2 + +export ALBERT_V1, ALBERT_V2, GPT2 +export load, tokenizer, sentence_from_tokens, ids_from_tokens, tokenize, sentence_from_tokens_gpt2 export PretrainedTokenizer, tokenizer_files include("words/fast.jl") diff --git a/test/gpt2_tokenizer.jl b/test/gpt2_tokenizer.jl index 17b02b6..9f0a6c6 100644 --- a/test/gpt2_tokenizer.jl +++ b/test/gpt2_tokenizer.jl @@ -22,4 +22,21 @@ end tokens = tokenize("I love julia language", gpt2_tokenizer) @test ids_from_tokens(tokens, gpt2_tokenizer) == [40, 1842, 474, 43640, 3303] @test sentence_from_tokens_gpt2(tokens) == "I love julia language" + + tokens= tokenize("A census taker once tried to test me. I ate his liver with some fava beans and a nice Chianti.", gpt2_tokenizer) + @test tokens == ["A", "Ġcensus", "Ġt", "aker", "Ġonce", + "Ġtried", "Ġto", "Ġtest", "Ġme", ".", + "ĠI", "Ġate", "Ġhis", "Ġliver", "Ġwith", + "Ġsome", "Ġfav", "a","Ġbeans", "Ġand", + "Ġa", "Ġnice", "ĠCh", "iant", "i", "."] + @test ids_from_tokens(tokens, gpt2_tokenizer) == [32, 21649, 256, 3110, 1752, 3088, 284, 1332, 502, 13, 314, 15063, + 465, 14383, 351, 617, 2090, 64, 16567, 290, 257, 3621, 609, 3014, + 72, 13] + + text = "Badges? We ain't got no badges:) We don't need no badges:p I don't have to show you any stinking badges!" 
+ tokens = tokenize(text, gpt2_tokenizer) + @test tokens == ["Bad", "ges", "?", "ĠWe", "Ġain", "'t", "Ġgot", "Ġno", "Ġbadges", ":", ")", "ĠWe", + "Ġdon", "'t", "Ġneed", "Ġno", "Ġbadges", ":", "p", "ĠI", "Ġdon", "'t", "Ġhave", + "Ġto", "Ġshow", "Ġyou", "Ġany", "Ġst", "inking", "Ġbadges", "!"] + @test sentence_from_tokens_gpt2(tokens) == text end From 6f2f448535461c1c1844ff900a50c2514fff5bf6 Mon Sep 17 00:00:00 2001 From: Shikhar Goswami Date: Mon, 22 Mar 2021 13:01:17 +0530 Subject: [PATCH 5/7] Standardised API to match with existing one --- Project.toml | 2 +- src/WordTokenizers.jl | 2 +- src/statistical/gpt2tokenizer.jl | 16 ++++++++-------- src/statistical/unigram.jl | 10 +++++----- test/gpt2_tokenizer.jl | 16 ++++++++-------- test/sp_unigram.jl | 12 ++++++------ 6 files changed, 29 insertions(+), 29 deletions(-) diff --git a/Project.toml b/Project.toml index cf375d3..526c3ec 100644 --- a/Project.toml +++ b/Project.toml @@ -14,7 +14,7 @@ Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" DataDeps = "0.6.5, 0.7" HTML_Entities = "1" StrTables = "1" -julia = "1" +julia = "1, 1.1" JSON = "0.21.1" InternedStrings = "0.7.0" diff --git a/src/WordTokenizers.jl b/src/WordTokenizers.jl index e833d63..70b46aa 100644 --- a/src/WordTokenizers.jl +++ b/src/WordTokenizers.jl @@ -19,7 +19,7 @@ export poormans_tokenize, punctuation_space_tokenize, toktok_tokenize export ALBERT_V1, ALBERT_V2, GPT2 -export load, tokenizer, sentence_from_tokens, ids_from_tokens, tokenize, sentence_from_tokens_gpt2 +export load, tokenize, sentence_from_tokens, ids_from_tokens export PretrainedTokenizer, tokenizer_files include("words/fast.jl") diff --git a/src/statistical/gpt2tokenizer.jl b/src/statistical/gpt2tokenizer.jl index 8e93dd4..6732311 100644 --- a/src/statistical/gpt2tokenizer.jl +++ b/src/statistical/gpt2tokenizer.jl @@ -148,7 +148,7 @@ function bpe(token::String, tokenizer::GPT2Tokenizer) end """ -tokenize(text::String, tokenizer::GPT2Tokenizer) +tokenize(tokenizer::GPT2Tokenizer, 
text::String) Implements tokenization of input text. This tokenizer doesn't include unknown and special tokens because of its byte-level BPE tokenization. GPT2 model is only trained on end token `<|endoftext|>`. Has to be manually added after the tokenization. @@ -156,7 +156,7 @@ GPT2 Tokenizer treats whitespace as unicode character `\u0120 (Ġ)` before a wor # Example ```julia-repl -julia> tokens = tokenize("Hi! How you doin", tokenizer) +julia> tokens = tokenize(tokenizer, "Hi! How you doin") 6-element Array{String,1}: "Hi" "!" @@ -166,7 +166,7 @@ julia> tokens = tokenize("Hi! How you doin", tokenizer) "in" ``` """ -function tokenize(text::String, tokenizer::GPT2Tokenizer) +function tokenize(tokenizer::GPT2Tokenizer, text::String) mapping = bytes_to_unicode() tokens=Vector{String}() matches = map(eachmatch(tokenizer.pat, text)) do m @@ -180,7 +180,7 @@ function tokenize(text::String, tokenizer::GPT2Tokenizer) end """ -ids_from_tokens(tokens::Vector{String}, tokenizer::GPT2Tokenizer) +ids_from_tokens(tokenizer::GPT2Tokenizer, tokens::Vector{String}) Returns respective ids of tokens from pretrained vocabulary map # Example @@ -194,7 +194,7 @@ julia> tokens = tokenize("Hi! 
How you doin", tokenizer) "Ġdo" "in" -julia> ids_from_tokens(tokens, tokenizer) +julia> ids_from_tokens(tokenizer, tokens) 6-element Array{Int64,1}: 17250 0 @@ -204,14 +204,14 @@ julia> ids_from_tokens(tokens, tokenizer) 259 ``` """ -function ids_from_tokens(tokens::Vector{String}, tokenizer::GPT2Tokenizer) +function ids_from_tokens(tokenizer::GPT2Tokenizer, tokens::Vector{String}) map(tokens) do x last(get(tokenizer.vocab, x, 0)) end end -function sentence_from_tokens_gpt2(tk::Array{String,1}) - sen = join(tk) +function sentence_from_tokens(tokenizer::GPT2Tokenizer, tokens::Array{String,1}) + sen = join(tokens) sen = replace(sen, "Ġ" => " ") sen = strip(sen) return sen diff --git a/src/statistical/unigram.jl b/src/statistical/unigram.jl index b3808d3..dd4df0d 100644 --- a/src/statistical/unigram.jl +++ b/src/statistical/unigram.jl @@ -152,11 +152,11 @@ function decode_backward(sp::SentencePieceModel, nodes::Array{Nodes,1}, text::Ab end """ - tokenizer(sp::SentencePieceModel,text::AbstractString) + tokenize(sp::SentencePieceModel,text::AbstractString) It does all the preprocessing step needed and perform `decode_forward` and `decode_backward` ouput tokenize tokens as Array{String,1} """ -function tokenizer(sp::SentencePieceModel, text::AbstractString) +function tokenize(sp::SentencePieceModel, text::AbstractString) text = replace(text, " " => "▁") if text[1] != '▁' text = "▁" * text @@ -174,7 +174,7 @@ end It does all the preprocessing step needed and perform `decode_forward` and `decode_backward`. 
""" function (sp::SentencePieceModel)(text::AbstractString) - tokenizer(sp, text) + tokenize(sp, text) end """ @@ -188,10 +188,10 @@ function ids_from_tokens(spm::SentencePieceModel, tk::Array{String,1}) end """ - sentence_from_tokens(tk::Array{String,1}) + sentence_from_tokens(spm::SentencePieceModel, tk::Array{String,1}) given tokens it provide its sentences """ -function sentence_from_tokens(tk::Array{String,1}) +function sentence_from_tokens(spm::SentencePieceModel, tk::Array{String,1}) sen = join(tk) sen = replace(sen, "▁" => " ") sen = strip(sen) diff --git a/test/gpt2_tokenizer.jl b/test/gpt2_tokenizer.jl index 9f0a6c6..195daae 100644 --- a/test/gpt2_tokenizer.jl +++ b/test/gpt2_tokenizer.jl @@ -14,29 +14,29 @@ end @testset "Tokenizer and helper function" begin @test gpt2_tokenizer.vocab["Hi"] == 17250 - @test tokenize("I love julia language", gpt2_tokenizer) == ["I", + @test tokenize(gpt2_tokenizer, "I love julia language") == ["I", "Ġlove", "Ġj", "ulia", "Ġlanguage"] - tokens = tokenize("I love julia language", gpt2_tokenizer) - @test ids_from_tokens(tokens, gpt2_tokenizer) == [40, 1842, 474, 43640, 3303] - @test sentence_from_tokens_gpt2(tokens) == "I love julia language" + tokens = tokenize(gpt2_tokenizer, "I love julia language") + @test ids_from_tokens(gpt2_tokenizer, tokens) == [40, 1842, 474, 43640, 3303] + @test sentence_from_tokens(gpt2_tokenizer, tokens) == "I love julia language" - tokens= tokenize("A census taker once tried to test me. I ate his liver with some fava beans and a nice Chianti.", gpt2_tokenizer) + tokens= tokenize(gpt2_tokenizer, "A census taker once tried to test me. 
I ate his liver with some fava beans and a nice Chianti.") @test tokens == ["A", "Ġcensus", "Ġt", "aker", "Ġonce", "Ġtried", "Ġto", "Ġtest", "Ġme", ".", "ĠI", "Ġate", "Ġhis", "Ġliver", "Ġwith", "Ġsome", "Ġfav", "a","Ġbeans", "Ġand", "Ġa", "Ġnice", "ĠCh", "iant", "i", "."] - @test ids_from_tokens(tokens, gpt2_tokenizer) == [32, 21649, 256, 3110, 1752, 3088, 284, 1332, 502, 13, 314, 15063, + @test ids_from_tokens(gpt2_tokenizer, tokens) == [32, 21649, 256, 3110, 1752, 3088, 284, 1332, 502, 13, 314, 15063, 465, 14383, 351, 617, 2090, 64, 16567, 290, 257, 3621, 609, 3014, 72, 13] text = "Badges? We ain't got no badges:) We don't need no badges:p I don't have to show you any stinking badges!" - tokens = tokenize(text, gpt2_tokenizer) + tokens = tokenize(gpt2_tokenizer, text) @test tokens == ["Bad", "ges", "?", "ĠWe", "Ġain", "'t", "Ġgot", "Ġno", "Ġbadges", ":", ")", "ĠWe", "Ġdon", "'t", "Ġneed", "Ġno", "Ġbadges", ":", "p", "ĠI", "Ġdon", "'t", "Ġhave", "Ġto", "Ġshow", "Ġyou", "Ġany", "Ġst", "inking", "Ġbadges", "!"] - @test sentence_from_tokens_gpt2(tokens) == text + @test sentence_from_tokens(gpt2_tokenizer, tokens) == text end diff --git a/test/sp_unigram.jl b/test/sp_unigram.jl index f85686e..416ca1e 100644 --- a/test/sp_unigram.jl +++ b/test/sp_unigram.jl @@ -16,12 +16,12 @@ end end @testset "Tokenizers and helper function" begin @test spm.vocab_map["now"][2] == 1388 - @test tokenizer(spm, "I love julia language") == ["▁", - "I", - "▁love", - "▁julia", + @test tokenize(spm, "I love julia language") == ["▁", + "I", + "▁love", + "▁julia", "▁language"] - tks = tokenizer(spm, "i love julia language") + tks = tokenize(spm, "i love julia language") @test ids_from_tokens(spm, tks) == [32, 340, 5424, 817] - @test sentence_from_tokens(tks) == "i love julia language" + @test sentence_from_tokens(spm, tks) == "i love julia language" end From 6d2e8a9e5dd9fe82a3aeadbc927a5178b2a4de71 Mon Sep 17 00:00:00 2001 From: Shikhar Goswami Date: Mon, 22 Mar 2021 13:25:36 +0530 Subject: 
[PATCH 6/7] Corrected and Modified README --- README.md | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 148f666..b394a81 100644 --- a/README.md +++ b/README.md @@ -294,42 +294,50 @@ julia> tokenize("hi__hello") "__" "hihello" ``` -# Statistical Tokenizer +# Statistical Tokenizer -**Sentencepiece Unigram Encoder** is basically the Sentencepiece processor's re-implementation in julia. It can used vocab file generated by sentencepiece library containing both vocab and log probability. + - **Sentencepiece Unigram Encoder** is basically the Sentencepiece processor's re-implementation in julia. It can use a vocab file generated by the sentencepiece library containing both vocab and log probability. + For more detail about the implementation, refer to the blog post [here](https://tejasvaidhyadev.github.io/blog/Sentencepiece) -For more detail about implementation refer the blog post [here](https://tejasvaidhyadev.github.io/blog/Sentencepiece) + - **GPT2 Tokenizer** is the subword tokenizer which uses byte-level Byte Pair Encoding to split unknown words into known subwords present in its pretrained vocabulary. **Note** : - SentencePiece escapes the whitespace with a meta symbol "▁" (U+2581). +- GPT2Tokenizer treats whitespace before a word as part of the word and escapes it with the meta symbol "Ġ" (U+0120). -### Pretrained +### Pretrained -Wordtokenizer provides pretrained vocab file of Albert (both version-1 and version-2) +WordTokenizers provides pretrained vocab files for Albert (both version-1 and version-2) and GPT2. You can initialize the tokenizers with the `load` function.
```julia julia> subtypes(PretrainedTokenizer) 2-element Array{Any,1}: ALBERT_V1 ALBERT_V2 + GPT2 -julia> tokenizerfiles(ALBERT_V1) +julia> tokenizer_files(ALBERT_V1) 4-element Array{String,1}: "albert_base_v1_30k-clean.vocab" "albert_large_v1_30k-clean.vocab" - "albert_xlarge_v1_30k-clean.vocab" + "albert_xlarge_v1_30k-clean.vocab" "albert_xxlarge_v1_30k-clean.vocab" + +julia> tokenizer_files(GPT2) +2-element Array{String,1}: + "GPT2/encoder.json" + "GPT2/vocab.bpe" ``` `DataDeps` will handle all the downloading part for us. You can also create an issue or PR for other pretrained models or directly load by providing path in `load` function ```julia -julia> spm = load(Albert_Version1) #loading Default Albert-base vocab in Sentencepiece +julia> spm = load(ALBERT_V1) #loading Default Albert-base vocab in Sentencepiece WordTokenizers.SentencePieceModel(Dict("▁shots"=>(-11.2373, 7281),"▁ordered"=>(-9.84973, 1906),"dev"=>(-12.0915, 14439),"▁silv"=>(-12.6564, 21065),"▁doubtful"=>(-12.7799, 22569),"▁without"=>(-8.34227, 367),"▁pol"=>(-10.7694, 4828),"chem"=>(-12.3713, 17661),"▁1947,"=>(-11.7544, 11199),"▁disrespect"=>(-13.13, 26682)…), 2) -julia> tk = tokenizer(spm, "i love the julia language") #or tk = spm("i love the julia language") +julia> tk = tokenize(spm, "i love the julia language") #or tk = spm("i love the julia language") 4-element Array{String,1}: "▁i" "▁love" @@ -337,7 +345,7 @@ julia> tk = tokenizer(spm, "i love the julia language") #or tk = spm("i love the "▁julia" "▁language" -julia> subword = tokenizer(spm, "unfriendly") +julia> subword = tokenize(spm, "unfriendly") 2-element Array{String,1}: "▁un" "friendly" @@ -359,8 +367,8 @@ julia> para = spm("Julia is a high-level, high-performance dynamic language for "▁dynamic" "▁language" "▁for" - "▁technical" - "▁computing" + "▁technical" + "▁computing" ``` Indices is usually used for deep learning models. 
@@ -382,13 +390,13 @@ julia> ids_from_tokens(spm, tk) 5424 817 #we can also get sentences back from tokens -julia> sentence_from_tokens(tk) +julia> sentence_from_tokens(spm, tk) "i love the julia language" -julia> sentence_from_token(subword) +julia> sentence_from_tokens(spm, subword) "unfriendly" -julia> sentence_from_tokens(para) +julia> sentence_from_tokens(spm, para) "Julia is a high-level, high-performance dynamic language for technical computing" ``` From c03685cde60c129e700ed9c1fe558a5b183154b5 Mon Sep 17 00:00:00 2001 From: Shikhar Goswami Date: Mon, 22 Mar 2021 13:52:05 +0530 Subject: [PATCH 7/7] Deleted Manifest.toml --- Manifest.toml | 167 -------------------------------------------------- 1 file changed, 167 deletions(-) delete mode 100644 Manifest.toml diff --git a/Manifest.toml b/Manifest.toml deleted file mode 100644 index ca5eecd..0000000 --- a/Manifest.toml +++ /dev/null @@ -1,167 +0,0 @@ -# This file is machine-generated - editing it directly is not advised - -[[Artifacts]] -deps = ["Pkg"] -git-tree-sha1 = "c30985d8821e0cd73870b17b0ed0ce6dc44cb744" -uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" -version = "1.3.0" - -[[Base64]] -uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" - -[[BinaryProvider]] -deps = ["Libdl", "Logging", "SHA"] -git-tree-sha1 = "ecdec412a9abc8db54c0efc5548c64dfce072058" -uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" -version = "0.5.10" - -[[DataDeps]] -deps = ["BinaryProvider", "HTTP", "Libdl", "Reexport", "SHA", "p7zip_jll"] -git-tree-sha1 = "4f0e41ff461d42cfc62ff0de4f1cd44c6e6b3771" -uuid = "124859b0-ceae-595e-8997-d05f6a7a8dfe" -version = "0.7.7" - -[[Dates]] -deps = ["Printf"] -uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" - -[[Distributed]] -deps = ["Random", "Serialization", "Sockets"] -uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" - -[[HTML_Entities]] -deps = ["StrTables"] -git-tree-sha1 = "aa19515d6ebe7f91a39cfc1dc6341f38fcac1282" -uuid = "7693890a-d069-55fe-a829-b4a6d304f0ee" -version = "1.0.0" - -[[HTTP]] -deps = 
["Base64", "Dates", "IniFile", "MbedTLS", "NetworkOptions", "Sockets", "URIs"] -git-tree-sha1 = "c9f380c76d8aaa1fa7ea9cf97bddbc0d5b15adc2" -uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3" -version = "0.9.5" - -[[IniFile]] -deps = ["Test"] -git-tree-sha1 = "098e4d2c533924c921f9f9847274f2ad89e018b8" -uuid = "83e8ac13-25f8-5344-8a64-a9f2b223428f" -version = "0.5.0" - -[[InteractiveUtils]] -deps = ["Markdown"] -uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" - -[[InternedStrings]] -deps = ["Random", "Test"] -git-tree-sha1 = "eb05b5625bc5d821b8075a77e4c421933e20c76b" -uuid = "7d512f48-7fb1-5a58-b986-67e6dc259f01" -version = "0.7.0" - -[[JLLWrappers]] -git-tree-sha1 = "a431f5f2ca3f4feef3bd7a5e94b8b8d4f2f647a0" -uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" -version = "1.2.0" - -[[JSON]] -deps = ["Dates", "Mmap", "Parsers", "Unicode"] -git-tree-sha1 = "81690084b6198a2e1da36fcfda16eeca9f9f24e4" -uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" -version = "0.21.1" - -[[LibGit2]] -deps = ["Printf"] -uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" - -[[Libdl]] -uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" - -[[Logging]] -uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" - -[[Markdown]] -deps = ["Base64"] -uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" - -[[MbedTLS]] -deps = ["Dates", "MbedTLS_jll", "Random", "Sockets"] -git-tree-sha1 = "1c38e51c3d08ef2278062ebceade0e46cefc96fe" -uuid = "739be429-bea8-5141-9913-cc70e7f3736d" -version = "1.0.3" - -[[MbedTLS_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "0eef589dd1c26a3ac9d753fe1a8bcad63f956fa6" -uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" -version = "2.16.8+1" - -[[Mmap]] -uuid = "a63ad114-7e13-5084-954f-fe012c677804" - -[[NetworkOptions]] -git-tree-sha1 = "ed3157f48a05543cce9b241e1f2815f7e843d96e" -uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" -version = "1.2.0" - -[[Parsers]] -deps = ["Dates"] -git-tree-sha1 = "223a825cccef2228f3fdbf2ecc7ca93363059073" -uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version 
= "1.0.16" - -[[Pkg]] -deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] -uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" - -[[Printf]] -deps = ["Unicode"] -uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" - -[[REPL]] -deps = ["InteractiveUtils", "Markdown", "Sockets"] -uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" - -[[Random]] -deps = ["Serialization"] -uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" - -[[Reexport]] -git-tree-sha1 = "57d8440b0c7d98fc4f889e478e80f268d534c9d5" -uuid = "189a3867-3050-52da-a836-e630ba90ab69" -version = "1.0.0" - -[[SHA]] -uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" - -[[Serialization]] -uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" - -[[Sockets]] -uuid = "6462fe0b-24de-5631-8697-dd941f90decc" - -[[StrTables]] -deps = ["Dates"] -git-tree-sha1 = "5998faae8c6308acc25c25896562a1e66a3bb038" -uuid = "9700d1a9-a7c8-5760-9816-a99fda30bb8f" -version = "1.0.1" - -[[Test]] -deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] -uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - -[[URIs]] -git-tree-sha1 = "7855809b88d7b16e9b029afd17880930626f54a2" -uuid = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" -version = "1.2.0" - -[[UUIDs]] -deps = ["Random", "SHA"] -uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" - -[[Unicode]] -uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" - -[[p7zip_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "ee65cfa19bea645698a0224bfa216f2b1c8b559f" -uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" -version = "16.2.0+3"