src/kenlm/python/kenlm.pyx - joshua - Git at Google

 import os
 cimport _kenlm

 cdef bytes as_str(data):
     """
     Coerce ``data`` to a ``bytes`` object.

     ``bytes`` input is returned unchanged and ``unicode`` input is encoded
     as UTF-8.  Any other type raises ``TypeError``.
     """
     if isinstance(data, unicode):
         return data.encode('utf8')
     if not isinstance(data, bytes):
         raise TypeError('Cannot convert %s to string' % type(data))
     return data

 cdef class FullScoreReturn:
     """
     Python-level counterpart of the C++ FullScoreReturn result.

     Notes:
         `prob` has been renamed to `log_prob`
         `oov` has been added to flag whether the word is OOV
     """

     # C-level storage; exposed read-only through the properties below.
     cdef float log_prob
     cdef int ngram_length
     cdef bint oov

     def __cinit__(self, log_prob, ngram_length, oov):
         # Each value is coerced to the declared C type on assignment.
         self.log_prob, self.ngram_length, self.oov = (
             log_prob, ngram_length, oov)

     def __repr__(self):
         # Renders as ClassName(log_prob, ngram_length, oov).
         return '%s(%r, %r, %r)' % (
             type(self).__name__,
             self.log_prob,
             self.ngram_length,
             self.oov,
         )

     property log_prob:
         """Stored `log_prob` value (read-only)."""
         def __get__(self):
             return self.log_prob

     property ngram_length:
         """Stored `ngram_length` value (read-only)."""
         def __get__(self):
             return self.ngram_length

     property oov:
         """Stored OOV flag (read-only)."""
         def __get__(self):
             return self.oov

 cdef class State:
     """
     Wrapper around lm::ngram::State so that python code can make incremental queries.

     Notes:
         * rich comparisons
         * hashable
     """

     cdef _kenlm.State _c_state

     def __richcmp__(State qa, qb, int op):
         # The right-hand operand is deliberately untyped: a parameter typed
         # as an extension type also admits None, and reading `_c_state` on
         # None is not safe.  Anything that is not a State (None included)
         # yields NotImplemented so Python applies its normal fallback.
         cdef State rhs
         if not isinstance(qb, State):
             return NotImplemented
         rhs = <State>qb
         # `r` is the result of the wrapped state's Compare(); its sign is
         # mapped onto the requested comparison operator below.
         r = qa._c_state.Compare(rhs._c_state)
         if op == 0:    # <
             return r < 0
         elif op == 1:  # <=
             return r <= 0
         elif op == 2:  # ==
             return r == 0
         elif op == 3:  # !=
             return r != 0
         elif op == 4:  # >
             return r > 0
         else:          # >=
             return r >= 0

     def __hash__(self):
         # Delegates to the hash_value() function declared in _kenlm.
         return _kenlm.hash_value(self._c_state)


 cdef class LanguageModel:
     """
     This is not a strict wrapper, the interface is more pythonic.
     It loads models and queries full sentences.
     """

     cdef _kenlm.Model* model
     cdef public bytes path
     cdef _kenlm.const_Vocabulary* vocab

     def __init__(self, path):
         """
         Load the language model.

         :param path: path to an arpa file or a kenlm binary file
             (``bytes`` or ``unicode``).
         :raises TypeError: if ``path`` is neither ``bytes`` nor ``unicode``.
         :raises IOError: if loading the model raises ``RuntimeError``.
         """
         self.path = os.path.abspath(as_str(path))
         try:
             self.model = _kenlm.LoadVirtual(self.path)
         except RuntimeError as exception:
             # Collapse a multi-line loader message onto a single line.
             exception_message = str(exception).replace('\n', ' ')
             raise IOError('Cannot read model \'{}\' ({})'.format(path, exception_message))\
                     from exception
         self.vocab = &self.model.BaseVocabulary()

     def __dealloc__(self):
         # Free the model object obtained from LoadVirtual() in __init__.
         del self.model

     property order:
         """Value of the underlying model's ``Order()``."""
         def __get__(self):
             return self.model.Order()

     def score(self, sentence, bos = True, eos = True):
         """
         score(sentence, bos = True, eos = True) -> total score of the sentence

         The sentence is split on whitespace and the per-word scores are summed.

         @param sentence is a string (do not use boundary symbols)
         @param bos start from the begin-of-sentence state instead of the null context
         @param eos also add the score of the end-of-sentence token
         """
         cdef list words = as_str(sentence).split()
         cdef _kenlm.State state
         if bos:
             self.model.BeginSentenceWrite(&state)
         else:
             self.model.NullContextWrite(&state)
         cdef _kenlm.State out_state
         cdef float total = 0
         for word in words:
             total += self.model.BaseScore(&state, self.vocab.Index(word), &out_state)
             # The state produced for this word is the context of the next one.
             state = out_state
         if eos:
             total += self.model.BaseScore(&state, self.vocab.EndSentence(), &out_state)
         return total

     def full_scores(self, sentence, bos = True, eos = True):
         """
         full_scores(sentence, bos = True, eos = True) -> generate full scores (prob, ngram length, oov)
         @param sentence is a string (do not use boundary symbols)
         @param bos should kenlm add a bos state
         @param eos should kenlm add an eos state
         """
         cdef list words = as_str(sentence).split()
         cdef _kenlm.State state
         if bos:
             self.model.BeginSentenceWrite(&state)
         else:
             self.model.NullContextWrite(&state)
         cdef _kenlm.State out_state
         cdef _kenlm.FullScoreReturn ret
         cdef _kenlm.WordIndex wid
         for word in words:
             wid = self.vocab.Index(word)
             ret = self.model.BaseFullScore(&state, wid, &out_state)
             # A word index of 0 is reported as out-of-vocabulary.
             yield (ret.prob, ret.ngram_length, wid == 0)
             state = out_state
         if eos:
             ret = self.model.BaseFullScore(&state,
                 self.vocab.EndSentence(), &out_state)
             # The end-of-sentence token is never flagged as OOV.
             yield (ret.prob, ret.ngram_length, False)

     def __contains__(self, word):
         # True when the vocabulary maps `word` to a non-zero index.
         cdef bytes w = as_str(word)
         return (self.vocab.Index(w) != 0)

     def __repr__(self):
         return '<LanguageModel from {0}>'.format(os.path.basename(self.path))

     def __reduce__(self):
         """Support pickling: rebuild the object by reloading it from ``path``."""
         # The reconstructor must be this Python-visible class; `_kenlm` is
         # the cimported C++ declarations module, not where the class lives.
         return (LanguageModel, (self.path,))

 cdef class Model:
     """
     This is closer to a wrapper around lm::ngram::Model.
     """

     cdef _kenlm.Model* model
     cdef public bytes path
     cdef _kenlm.const_Vocabulary* vocab

     def __init__(self, path):
         """
         Load the language model.

         :param path: path to an arpa file or a kenlm binary file
             (``bytes`` or ``unicode``).
         :raises TypeError: if ``path`` is neither ``bytes`` nor ``unicode``.
         :raises IOError: if loading the model raises ``RuntimeError``.
         """
         self.path = os.path.abspath(as_str(path))
         try:
             self.model = _kenlm.LoadVirtual(self.path)
         except RuntimeError as exception:
             # Collapse a multi-line loader message onto a single line.
             exception_message = str(exception).replace('\n', ' ')
             raise IOError('Cannot read model \'{}\' ({})'.format(path, exception_message))\
                     from exception
         self.vocab = &self.model.BaseVocabulary()

     def __dealloc__(self):
         # Free the model object obtained from LoadVirtual() in __init__.
         del self.model

     property order:
         """Value of the underlying model's ``Order()``."""
         def __get__(self):
             return self.model.Order()

     # NOTE: every State parameter below is declared `not None`.  Without it
     # Cython also accepts None for an extension-type argument, and taking
     # the address of `_c_state` on None is not safe.

     def BeginSentenceWrite(self, State state not None):
         """Change the given state to a BOS state."""
         self.model.BeginSentenceWrite(&state._c_state)

     def NullContextWrite(self, State state not None):
         """Change the given state to a NULL state."""
         self.model.NullContextWrite(&state._c_state)

     def BaseScore(self, State in_state not None, word, State out_state not None):
         """
         Return the score of `word` given `in_state` and update the output state.
         Wrapper around model.BaseScore(in_state, Index(word), out_state)

         :param in_state: the context
         :param word: the suffix (``bytes`` or ``unicode``)
         :param out_state: state that is overwritten with the resulting state
         :returns: the score of `word` given `in_state`
         :raises TypeError: if a state is None or `word` is not a string
         """
         cdef float total = self.model.BaseScore(&in_state._c_state, self.vocab.Index(as_str(word)), &out_state._c_state)
         return total

     def BaseFullScore(self, State in_state not None, word, State out_state not None):
         """
         Wrapper around model.BaseFullScore(in_state, Index(word), out_state)

         :param in_state: the context
         :param word: the suffix (``bytes`` or ``unicode``)
         :param out_state: state that is overwritten with the resulting state
         :returns: FullScoreReturn for `word` given `in_state`
         :raises TypeError: if a state is None or `word` is not a string
         """
         cdef _kenlm.WordIndex wid = self.vocab.Index(as_str(word))
         cdef _kenlm.FullScoreReturn ret = self.model.BaseFullScore(&in_state._c_state, wid, &out_state._c_state)
         # A word index of 0 is reported as out-of-vocabulary.
         return FullScoreReturn(ret.prob, ret.ngram_length, wid == 0)

     def __contains__(self, word):
         # True when the vocabulary maps `word` to a non-zero index.
         cdef bytes w = as_str(word)
         return (self.vocab.Index(w) != 0)

     def __repr__(self):
         return '<Model from {0}>'.format(os.path.basename(self.path))

     def __reduce__(self):
         """Support pickling: rebuild the object by reloading it from ``path``."""
         # Reconstruct as Model itself so a pickled Model round-trips to the
         # same class.
         return (Model, (self.path,))
	import os
	cimport _kenlm

	cdef bytes as_str(data):
	    """
	    Coerce ``data`` to a ``bytes`` object.

	    ``bytes`` input is returned unchanged and ``unicode`` input is encoded
	    as UTF-8.  Any other type raises ``TypeError``.
	    """
	    if isinstance(data, unicode):
	        return data.encode('utf8')
	    if not isinstance(data, bytes):
	        raise TypeError('Cannot convert %s to string' % type(data))
	    return data

	cdef class FullScoreReturn:
	    """
	    Python-level counterpart of the C++ FullScoreReturn result.

	    Notes:
	        `prob` has been renamed to `log_prob`
	        `oov` has been added to flag whether the word is OOV
	    """

	    # C-level storage; exposed read-only through the properties below.
	    cdef float log_prob
	    cdef int ngram_length
	    cdef bint oov

	    def __cinit__(self, log_prob, ngram_length, oov):
	        # Each value is coerced to the declared C type on assignment.
	        self.log_prob, self.ngram_length, self.oov = (
	            log_prob, ngram_length, oov)

	    def __repr__(self):
	        # Renders as ClassName(log_prob, ngram_length, oov).
	        return '%s(%r, %r, %r)' % (
	            type(self).__name__,
	            self.log_prob,
	            self.ngram_length,
	            self.oov,
	        )

	    property log_prob:
	        """Stored `log_prob` value (read-only)."""
	        def __get__(self):
	            return self.log_prob

	    property ngram_length:
	        """Stored `ngram_length` value (read-only)."""
	        def __get__(self):
	            return self.ngram_length

	    property oov:
	        """Stored OOV flag (read-only)."""
	        def __get__(self):
	            return self.oov

	cdef class State:
	    """
	    Wrapper around lm::ngram::State so that python code can make incremental queries.

	    Notes:
	        * rich comparisons
	        * hashable
	    """

	    cdef _kenlm.State _c_state

	    def __richcmp__(State qa, qb, int op):
	        # The right-hand operand is deliberately untyped: a parameter typed
	        # as an extension type also admits None, and reading `_c_state` on
	        # None is not safe.  Anything that is not a State (None included)
	        # yields NotImplemented so Python applies its normal fallback.
	        cdef State rhs
	        if not isinstance(qb, State):
	            return NotImplemented
	        rhs = <State>qb
	        # `r` is the result of the wrapped state's Compare(); its sign is
	        # mapped onto the requested comparison operator below.
	        r = qa._c_state.Compare(rhs._c_state)
	        if op == 0:    # <
	            return r < 0
	        elif op == 1:  # <=
	            return r <= 0
	        elif op == 2:  # ==
	            return r == 0
	        elif op == 3:  # !=
	            return r != 0
	        elif op == 4:  # >
	            return r > 0
	        else:          # >=
	            return r >= 0

	    def __hash__(self):
	        # Delegates to the hash_value() function declared in _kenlm.
	        return _kenlm.hash_value(self._c_state)


	cdef class LanguageModel:
	    """
	    This is not a strict wrapper, the interface is more pythonic.
	    It loads models and queries full sentences.
	    """

	    cdef _kenlm.Model* model
	    cdef public bytes path
	    cdef _kenlm.const_Vocabulary* vocab

	    def __init__(self, path):
	        """
	        Load the language model.

	        :param path: path to an arpa file or a kenlm binary file
	            (``bytes`` or ``unicode``).
	        :raises TypeError: if ``path`` is neither ``bytes`` nor ``unicode``.
	        :raises IOError: if loading the model raises ``RuntimeError``.
	        """
	        self.path = os.path.abspath(as_str(path))
	        try:
	            self.model = _kenlm.LoadVirtual(self.path)
	        except RuntimeError as exception:
	            # Collapse a multi-line loader message onto a single line.
	            exception_message = str(exception).replace('\n', ' ')
	            raise IOError('Cannot read model \'{}\' ({})'.format(path, exception_message))\
	                    from exception
	        self.vocab = &self.model.BaseVocabulary()

	    def __dealloc__(self):
	        # Free the model object obtained from LoadVirtual() in __init__.
	        del self.model

	    property order:
	        """Value of the underlying model's ``Order()``."""
	        def __get__(self):
	            return self.model.Order()

	    def score(self, sentence, bos = True, eos = True):
	        """
	        score(sentence, bos = True, eos = True) -> total score of the sentence

	        The sentence is split on whitespace and the per-word scores are summed.

	        @param sentence is a string (do not use boundary symbols)
	        @param bos start from the begin-of-sentence state instead of the null context
	        @param eos also add the score of the end-of-sentence token
	        """
	        cdef list words = as_str(sentence).split()
	        cdef _kenlm.State state
	        if bos:
	            self.model.BeginSentenceWrite(&state)
	        else:
	            self.model.NullContextWrite(&state)
	        cdef _kenlm.State out_state
	        cdef float total = 0
	        for word in words:
	            total += self.model.BaseScore(&state, self.vocab.Index(word), &out_state)
	            # The state produced for this word is the context of the next one.
	            state = out_state
	        if eos:
	            total += self.model.BaseScore(&state, self.vocab.EndSentence(), &out_state)
	        return total

	    def full_scores(self, sentence, bos = True, eos = True):
	        """
	        full_scores(sentence, bos = True, eos = True) -> generate full scores (prob, ngram length, oov)
	        @param sentence is a string (do not use boundary symbols)
	        @param bos should kenlm add a bos state
	        @param eos should kenlm add an eos state
	        """
	        cdef list words = as_str(sentence).split()
	        cdef _kenlm.State state
	        if bos:
	            self.model.BeginSentenceWrite(&state)
	        else:
	            self.model.NullContextWrite(&state)
	        cdef _kenlm.State out_state
	        cdef _kenlm.FullScoreReturn ret
	        cdef _kenlm.WordIndex wid
	        for word in words:
	            wid = self.vocab.Index(word)
	            ret = self.model.BaseFullScore(&state, wid, &out_state)
	            # A word index of 0 is reported as out-of-vocabulary.
	            yield (ret.prob, ret.ngram_length, wid == 0)
	            state = out_state
	        if eos:
	            ret = self.model.BaseFullScore(&state,
	                self.vocab.EndSentence(), &out_state)
	            # The end-of-sentence token is never flagged as OOV.
	            yield (ret.prob, ret.ngram_length, False)

	    def __contains__(self, word):
	        # True when the vocabulary maps `word` to a non-zero index.
	        cdef bytes w = as_str(word)
	        return (self.vocab.Index(w) != 0)

	    def __repr__(self):
	        return '<LanguageModel from {0}>'.format(os.path.basename(self.path))

	    def __reduce__(self):
	        """Support pickling: rebuild the object by reloading it from ``path``."""
	        # The reconstructor must be this Python-visible class; `_kenlm` is
	        # the cimported C++ declarations module, not where the class lives.
	        return (LanguageModel, (self.path,))

	cdef class Model:
	    """
	    This is closer to a wrapper around lm::ngram::Model.
	    """

	    cdef _kenlm.Model* model
	    cdef public bytes path
	    cdef _kenlm.const_Vocabulary* vocab

	    def __init__(self, path):
	        """
	        Load the language model.

	        :param path: path to an arpa file or a kenlm binary file
	            (``bytes`` or ``unicode``).
	        :raises TypeError: if ``path`` is neither ``bytes`` nor ``unicode``.
	        :raises IOError: if loading the model raises ``RuntimeError``.
	        """
	        self.path = os.path.abspath(as_str(path))
	        try:
	            self.model = _kenlm.LoadVirtual(self.path)
	        except RuntimeError as exception:
	            # Collapse a multi-line loader message onto a single line.
	            exception_message = str(exception).replace('\n', ' ')
	            raise IOError('Cannot read model \'{}\' ({})'.format(path, exception_message))\
	                    from exception
	        self.vocab = &self.model.BaseVocabulary()

	    def __dealloc__(self):
	        # Free the model object obtained from LoadVirtual() in __init__.
	        del self.model

	    property order:
	        """Value of the underlying model's ``Order()``."""
	        def __get__(self):
	            return self.model.Order()

	    # NOTE: every State parameter below is declared `not None`.  Without it
	    # Cython also accepts None for an extension-type argument, and taking
	    # the address of `_c_state` on None is not safe.

	    def BeginSentenceWrite(self, State state not None):
	        """Change the given state to a BOS state."""
	        self.model.BeginSentenceWrite(&state._c_state)

	    def NullContextWrite(self, State state not None):
	        """Change the given state to a NULL state."""
	        self.model.NullContextWrite(&state._c_state)

	    def BaseScore(self, State in_state not None, word, State out_state not None):
	        """
	        Return the score of `word` given `in_state` and update the output state.
	        Wrapper around model.BaseScore(in_state, Index(word), out_state)

	        :param in_state: the context
	        :param word: the suffix (``bytes`` or ``unicode``)
	        :param out_state: state that is overwritten with the resulting state
	        :returns: the score of `word` given `in_state`
	        :raises TypeError: if a state is None or `word` is not a string
	        """
	        cdef float total = self.model.BaseScore(&in_state._c_state, self.vocab.Index(as_str(word)), &out_state._c_state)
	        return total

	    def BaseFullScore(self, State in_state not None, word, State out_state not None):
	        """
	        Wrapper around model.BaseFullScore(in_state, Index(word), out_state)

	        :param in_state: the context
	        :param word: the suffix (``bytes`` or ``unicode``)
	        :param out_state: state that is overwritten with the resulting state
	        :returns: FullScoreReturn for `word` given `in_state`
	        :raises TypeError: if a state is None or `word` is not a string
	        """
	        cdef _kenlm.WordIndex wid = self.vocab.Index(as_str(word))
	        cdef _kenlm.FullScoreReturn ret = self.model.BaseFullScore(&in_state._c_state, wid, &out_state._c_state)
	        # A word index of 0 is reported as out-of-vocabulary.
	        return FullScoreReturn(ret.prob, ret.ngram_length, wid == 0)

	    def __contains__(self, word):
	        # True when the vocabulary maps `word` to a non-zero index.
	        cdef bytes w = as_str(word)
	        return (self.vocab.Index(w) != 0)

	    def __repr__(self):
	        return '<Model from {0}>'.format(os.path.basename(self.path))

	    def __reduce__(self):
	        """Support pickling: rebuild the object by reloading it from ``path``."""
	        # Reconstruct as Model itself so a pickled Model round-trips to the
	        # same class.
	        return (Model, (self.path,))