import os
import kenlm
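# Path to the small test.arpa model, assumed to sit in an lm/ directory one level above this script.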
LM = os.path.join(os.path.dirname(__file__), '..', 'lm', 'test.arpa')
model = kenlm.LanguageModel(LM)
print('{0}-gram model'.format(model.order))
sentence = 'language modeling is fun .'
print(sentence)
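# score() returns the total log10 probability of the sentence, with <s> and </s> added by default.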
print(model.score(sentence))
# Check that total full score = direct score
def score(s):
    return sum(prob for prob, _, _ in model.full_scores(s))
assert (abs(score(sentence) - model.score(sentence)) < 1e-3)
# Show scores and n-gram matches
words = ['<s>'] + sentence.split() + ['</s>']
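# full_scores() yields (log10 probability, matched n-gram length, OOV flag) for each word and for </s>.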
for i, (prob, length, oov) in enumerate(model.full_scores(sentence)):
    print('{0} {1}: {2}'.format(prob, length, ' '.join(words[i+2-length:i+2])))
    if oov:
        print('\t"{0}" is an OOV'.format(words[i+1]))
# Find out-of-vocabulary words
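# 'w in model' tests whether the word is in the model's vocabulary.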
for w in words:
    if w not in model:
        print('"{0}" is an OOV'.format(w))