16 changes: 13 additions & 3 deletions src/statistical/unigram.jl
@@ -8,6 +8,14 @@ structure, To hold unknown token index and map of vocabulary to log probability
 struct SentencePieceModel
     vocab_map::Dict{String, Tuple{Float64, Int}}
     unk_id::Int
+    # used to make decoding more efficient
+    max_vocab_codeunit_len::Int
+
+    function SentencePieceModel(vocab_map, unk_id)
+        num_codeunits(idx) = isassigned(vocab_map.keys, idx) ? ncodeunits(vocab_map.keys[idx]) : 0
+        max_vocab_codeunit_len, _ = findmax(num_codeunits.(eachindex(vocab_map.keys)))
+        new(vocab_map, unk_id, max_vocab_codeunit_len)
+    end
 end
 
 """
@@ -97,14 +105,16 @@ function decode_forward(sp::SentencePieceModel, text::String)
     scores = fill(-Inf, lastindex(text))
     scores[1] = 0
     for char_end in eachindex(text)
-        for char_start in eachindex(text)
-            char_start > char_end && break
+        min_start = max(firstindex(text), char_end - sp.max_vocab_codeunit_len + 1)
+        candidate_text_substr = SubString(text, thisind(text, min_start), char_end)
+        for relative_idx in eachindex(candidate_text_substr)
+            char_start = candidate_text_substr.offset + relative_idx
             subtoken = SubString(text, char_start:char_end)
             if haskey(sp.vocab_map, subtoken)
                 subtokenid = sp.vocab_map[subtoken][2]
                 local_score = scores[char_start] + sp.vocab_map[subtoken][1]
                 if local_score > scores[char_end]
-                    results[char_end] = Nodes(SubString(text, char_start:char_end), local_score, subtokenid, char_start, char_end)
+                    results[char_end] = Nodes(subtoken, local_score, subtokenid, char_start, char_end)
                     scores[char_end] = local_score
                 end
             end
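
The point of the change: the old inner loop tried every possible `char_start` up to `char_end`, making each decode pass quadratic in the text length. Since no subtoken longer than the longest vocabulary entry can ever match, the new code only looks back `max_vocab_codeunit_len` codeunits. `thisind` snaps `min_start` onto a character boundary so `SubString` does not throw on multi-byte text, and `candidate_text_substr.offset + relative_idx` maps each window index back to an index into `text` (the `offset` field of a `SubString` is the codeunit offset of its start; `thisind(text, min_start) + relative_idx - 1` would be the internals-free form). A self-contained toy run of the window logic, with a made-up three-codeunit limit:

    # Standalone sketch of the bounded look-back window used in decode_forward.
    text = "scores"
    max_vocab_codeunit_len = 3   # hypothetical longest vocabulary entry
    for char_end in eachindex(text)
        min_start = max(firstindex(text), char_end - max_vocab_codeunit_len + 1)
        window = SubString(text, thisind(text, min_start), char_end)
        println(repr(window))    # prints "s", "sc", "sco", "cor", "ore", "res"
    end

With the bound in place, decoding does O(n·L) substring lookups for text length n and longest vocabulary entry L, instead of O(n²).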