scripts/support/phrase2hiero.py - joshua - Git at Google

 #!/usr/bin/python

 """
 Converts a Moses phrase table to a Joshua phrase table. The differences are
 (a) adding an LHS and (b) applying -log() to all the model weights.

 Usage: gzip -cd grammar.gz | phrase2hiero.py | gzip -9n > grammar.new.gz

 Author: Matt Post <post@cs.jhu.edu>
 Date:   June 2016
 """

 import sys
 import math
 import codecs

 reload(sys)
 sys.setdefaultencoding('utf-8')
 sys.stdin = codecs.getreader('utf-8')(sys.stdin)
 sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
 sys.stdout.encoding = 'utf-8'

 def maybelog(value):
     """Takes a feature value and returns -log(x) if it is a scalar"""
     try:
         return str(-1.0 * math.log(float(value)))
     except ValueError:
         return value

 for line in sys.stdin:
     moses = False

     # Moses phrase tables do not have a left-hand side symbol, add that
     if not line.startswith('['):
         line = '[X] ||| ' + line
         moses = True

     # Get all the fields
     tokens = line.split(r' ||| ')

     # take the -log() of each input token
     if moses and len(tokens) >= 4:
         tokens[3] = ' '.join(map(maybelog, tokens[3].split(' ')))

     print ' ||| '.join(tokens),
	#!/usr/bin/python

	"""
	Converts a Moses phrase table to a Joshua phrase table. The differences are
	(a) adding an LHS and (b) applying -log() to all the model weights.

	Usage: gzip -cd grammar.gz \| phrase2hiero.py \| gzip -9n > grammar.new.gz

	Author: Matt Post <post@cs.jhu.edu>
	Date: June 2016
	"""

	import sys
	import math
	import codecs

	reload(sys)
	sys.setdefaultencoding('utf-8')
	sys.stdin = codecs.getreader('utf-8')(sys.stdin)
	sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
	sys.stdout.encoding = 'utf-8'

	def maybelog(value):
	"""Takes a feature value and returns -log(x) if it is a scalar"""
	try:
	return str(-1.0 * math.log(float(value)))
	except ValueError:
	return value

	for line in sys.stdin:
	moses = False

	# Moses phrase tables do not have a left-hand side symbol, add that
	if not line.startswith('['):
	line = '[X] \|\|\| ' + line
	moses = True

	# Get all the fields
	tokens = line.split(r' \|\|\| ')

	# take the -log() of each input token
	if moses and len(tokens) >= 4:
	tokens[3] = ' '.join(map(maybelog, tokens[3].split(' ')))

	print ' \|\|\| '.join(tokens),