scripts/features/addSparseFeatures.py - joshua - Git at Google

 #!/usr/bin/env python

 import sys
 import gzip
 import argparse

 # Command-line interface. Only the input phrase table (-p) has no default.
 # NOTE: the string below is passed positionally, so argparse treats it as the
 # program name shown in usage output rather than as a description.
 parser = argparse.ArgumentParser("Adds sparse features to a Moses ttable")
 parser.add_argument("-p", "--ttable", dest="filteredPT", help="A phrase table, preferably a filtered one")
 parser.add_argument("-o", "--output", dest="featurizedPT", help="The location of the output ttable", default="tuning/filtered.1/phrase-table.ft.0-0.1.1.gz")
 parser.add_argument("-f", "--sparse_f", dest="sparseF", help="Source sparse features", default="model/sparse-features.1.en.top1000")
 parser.add_argument("-e", "--sparse_e", dest="sparseE", help="Target sparse features", default="model/sparse-features.1.es.top1000")
 opts = parser.parse_args()

 # Without an input phrase table there is nothing to do: show usage and stop.
 if opts.filteredPT is None:
     parser.print_help()
     sys.exit()

 filteredPT = opts.filteredPT
 sparseE = opts.sparseE
 sparseF = opts.sparseF

 # Everything is processed as bytes: the gzip streams are opened in binary
 # mode, so this works unchanged on Python 2 and Python 3, needs no encoding
 # assumption, and tokens are split on ASCII whitespace only.
 FIELD_SEP = b"|||"

 # First read off the sparse features and store them (one feature per line).
 # Sets give constant-time membership tests in the per-token checks below.
 with open(sparseE, 'rb') as sE:
     featsE = {line.strip() for line in sE}
 with open(sparseF, 'rb') as sF:
     featsF = {line.strip() for line in sF}

 # Both gzip handles are closed by the `with` block even if a line is malformed.
 with gzip.open(filteredPT, 'rb') as pt, gzip.open(opts.featurizedPT, 'wb') as featurizedPT:
     # Pass 1: for every phrase pair, collect the aligned (source, target)
     # word pairs whose words both appear in the sparse feature lists.
     phraseWT = []
     for lineNum, line in enumerate(pt, 1):
         lineComp = line.split(FIELD_SEP)
         if len(lineComp) <= 3:
             raise ValueError("Malformed phrase table line %d: expected at least 4 '|||'-separated fields" % lineNum)
         sPhrase = lineComp[0].strip().split()
         tPhrase = lineComp[1].strip().split()
         alignment = lineComp[3].strip().split()
         # Cache phrase features for use later
         localWT = set()
         # Read alignment information: each item is "<source index>-<target index>"
         for item in alignment:
             item = item.split(b"-")
             sWord = sPhrase[int(item[0])]
             tWord = tPhrase[int(item[1])]
             if sWord in featsF and tWord in featsE:
                 localWT.add((sWord, tWord))
         # Sorted so the output does not depend on set iteration order.
         phraseWT.append(sorted(localWT))

     # Pass 2: rewind the input and write each line with its features appended.
     pt.seek(0)
     total = len(phraseWT)
     for i, line in enumerate(pt):
         lineComp = line.split(FIELD_SEP)
         sPhrase = lineComp[0].strip().split()
         tPhrase = lineComp[1].strip().split()
         # Lexical sparse features
         # SD = source word deletion, TI = target word insertion
         # WT = word translation
         sd_features = [b"SD_" + token + b"=1" for token in sPhrase if token in featsF]
         ti_features = [b"TI_" + token + b"=1" for token in tPhrase if token in featsE]
         wt_features = [b"WT_" + sWord + b"~" + tWord + b"=1" for sWord, tWord in phraseWT[i]]

         all_feats = sd_features + ti_features + wt_features
         # The features are appended to the third field of the line.
         lineComp[2] += b" ".join(all_feats) + b" "
         featurizedPT.write(FIELD_SEP.join(lineComp))
         # Progress indicator, rewritten in place on a single console line.
         sys.stdout.write("\r%f%%" % (float(i * 100) / total))
         sys.stdout.flush()
	#!/usr/bin/env python

	import sys
	import gzip
	import argparse

	# Command-line interface. Only the input phrase table (-p) has no default.
	# NOTE: the string below is passed positionally, so argparse treats it as the
	# program name shown in usage output rather than as a description.
	parser = argparse.ArgumentParser("Adds sparse features to a Moses ttable")
	parser.add_argument("-p", "--ttable", dest="filteredPT", help="A phrase table, preferably a filtered one")
	parser.add_argument("-o", "--output", dest="featurizedPT", help="The location of the output ttable", default="tuning/filtered.1/phrase-table.ft.0-0.1.1.gz")
	parser.add_argument("-f", "--sparse_f", dest="sparseF", help="Source sparse features", default="model/sparse-features.1.en.top1000")
	parser.add_argument("-e", "--sparse_e", dest="sparseE", help="Target sparse features", default="model/sparse-features.1.es.top1000")
	opts = parser.parse_args()

	# Without an input phrase table there is nothing to do: show usage and stop.
	if opts.filteredPT is None:
	    parser.print_help()
	    sys.exit()

	filteredPT = opts.filteredPT
	sparseE = opts.sparseE
	sparseF = opts.sparseF

	# Everything is processed as bytes: the gzip streams are opened in binary
	# mode, so this works unchanged on Python 2 and Python 3, needs no encoding
	# assumption, and tokens are split on ASCII whitespace only.
	# The separator is the plain three-pipe sequence, with no backslashes.
	FIELD_SEP = b"|||"

	# First read off the sparse features and store them (one feature per line).
	# Sets give constant-time membership tests in the per-token checks below.
	with open(sparseE, 'rb') as sE:
	    featsE = {line.strip() for line in sE}
	with open(sparseF, 'rb') as sF:
	    featsF = {line.strip() for line in sF}

	# Both gzip handles are closed by the `with` block even if a line is malformed.
	with gzip.open(filteredPT, 'rb') as pt, gzip.open(opts.featurizedPT, 'wb') as featurizedPT:
	    # Pass 1: for every phrase pair, collect the aligned (source, target)
	    # word pairs whose words both appear in the sparse feature lists.
	    phraseWT = []
	    for lineNum, line in enumerate(pt, 1):
	        lineComp = line.split(FIELD_SEP)
	        if len(lineComp) <= 3:
	            raise ValueError("Malformed phrase table line %d: expected at least 4 '|||'-separated fields" % lineNum)
	        sPhrase = lineComp[0].strip().split()
	        tPhrase = lineComp[1].strip().split()
	        alignment = lineComp[3].strip().split()
	        # Cache phrase features for use later
	        localWT = set()
	        # Read alignment information: each item is "<source index>-<target index>"
	        for item in alignment:
	            item = item.split(b"-")
	            sWord = sPhrase[int(item[0])]
	            tWord = tPhrase[int(item[1])]
	            if sWord in featsF and tWord in featsE:
	                localWT.add((sWord, tWord))
	        # Sorted so the output does not depend on set iteration order.
	        phraseWT.append(sorted(localWT))

	    # Pass 2: rewind the input and write each line with its features appended.
	    pt.seek(0)
	    total = len(phraseWT)
	    for i, line in enumerate(pt):
	        lineComp = line.split(FIELD_SEP)
	        sPhrase = lineComp[0].strip().split()
	        tPhrase = lineComp[1].strip().split()
	        # Lexical sparse features
	        # SD = source word deletion, TI = target word insertion
	        # WT = word translation
	        sd_features = [b"SD_" + token + b"=1" for token in sPhrase if token in featsF]
	        ti_features = [b"TI_" + token + b"=1" for token in tPhrase if token in featsE]
	        wt_features = [b"WT_" + sWord + b"~" + tWord + b"=1" for sWord, tWord in phraseWT[i]]

	        all_feats = sd_features + ti_features + wt_features
	        # The features are appended to the third field of the line.
	        lineComp[2] += b" ".join(all_feats) + b" "
	        featurizedPT.write(FIELD_SEP.join(lineComp))
	        # Progress indicator, rewritten in place on a single console line.
	        sys.stdout.write("\r%f%%" % (float(i * 100) / total))
	        sys.stdout.flush()