# scripts/training/templates/thrax-phrase.conf - joshua - Git at Google

 # this is an example Thrax configuration file
 # <- this symbol indicates a comment
 # each line should be a key-value pair separated by whitespace

 ###
 ### GRAMMAR OPTIONS
 ###

 grammar     hiero   # or samt
 reverse     false
 source-is-parsed    false
 target-is-parsed    false
 # default-nt    X   # X is the default anyway

 min-rule-count 1

 # the number of reducers
 reducers 16

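 # not only do these next six options have the suggested values as given
 # in Chiang's "Hierarchical Phrase-based Translation" (CL), they are also
 # Thrax's default values! You could comment them out and the resulting grammar
 # would be identical.
 # NOTE(review): arity is set to 0 below (no nonterminals, i.e. phrase-only
 # rules), which does not look like a hierarchical default -- confirm against
 # Thrax's defaults before commenting these options out.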

 # maximum length of initial phrase pairs
 initial-phrase-length   10
 lex-source-words        10
 lex-target-words        10

 # maximum number of NTs in a rule
 arity                   0

 # minimum number of aligned terminals in a rule
 lexicality              1

 # allow adjacent nonterminals on source side
 adjacent-nts    false

 # allow unaligned words at boundaries of phrases
 loose           true

 allow-abstract-rules    false
 allow-nonlexical-x      false
 allow-full-sentence-rules   false

 nonlex-source-length    5
 nonlex-target-length    5
 nonlex-source-words     5
 nonlex-target-words     5

 allow-double-plus    false

 rule-span-limit         12

 phrase-penalty  2.718

 # a whitespace-separated list of features
 # in this example, the features are the phrase translation probabilities,
 # the lexical probabilities, rarity, phrase penalty, and alignment
 # features        phrase-penalty e2fphrase f2ephrase lexprob lexical abstract adjacent x-rule source-terminals-without-target target-terminals-without-source monotonic glue-rule rarity target-word-count unaligned-count
 features        e_given_f_phrase f_given_e_phrase e_given_f_lex f_given_e_lex rarity phrase-penalty alignment

 # joshua is the only option and the default; later we will want to add
 # formats for other decoders such as moses and cdec, if they use other formats
 output-format   joshua

 # label feature scores? each score will be output as name=score
 label-feature-scores false

 amazon-work s3://edu.jhu.cs.jonny/wmt11/fr-en/hiero
 amazon-jar  s3://edu.jhu.cs.jonny/thrax.jar
 amazon-num-instances    15

 max-split-size  8388608

 # the format should be:
 # foreign sentence ||| english sentence ||| alignment
 # where the english is either parsed or not depending on whether you want
 # SAMT or you want Hiero.
 #input-file  s3://edu.jhu.cs.jonny/wmt11/corpus.fr-en
	# this is an example Thrax configuration file
	# <- this symbol indicates a comment
	# each line should be a key-value pair separated by whitespace

	###
	### GRAMMAR OPTIONS
	###

	grammar hiero # or samt
	reverse false
	source-is-parsed false
	target-is-parsed false
	# default-nt X # X is the default anyway

	min-rule-count 1

	# the number of reducers
	reducers 16

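	# not only do these next six options have the suggested values as given
	# in Chiang's "Hierarchical Phrase-based Translation" (CL), they are also
	# Thrax's default values! You could comment them out and the resulting grammar
	# would be identical.
	# NOTE(review): arity is set to 0 below (no nonterminals, i.e. phrase-only
	# rules), which does not look like a hierarchical default -- confirm against
	# Thrax's defaults before commenting these options out.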

	# maximum length of initial phrase pairs
	initial-phrase-length 10
	lex-source-words 10
	lex-target-words 10

	# maximum number of NTs in a rule
	arity 0

	# minimum number of aligned terminals in a rule
	lexicality 1

	# allow adjacent nonterminals on source side
	adjacent-nts false

	# allow unaligned words at boundaries of phrases
	loose true

	allow-abstract-rules false
	allow-nonlexical-x false
	allow-full-sentence-rules false

	nonlex-source-length 5
	nonlex-target-length 5
	nonlex-source-words 5
	nonlex-target-words 5

	allow-double-plus false

	rule-span-limit 12

	phrase-penalty 2.718

	# a whitespace-separated list of features
	# in this example, the features are the phrase translation probabilities,
	# the lexical probabilities, rarity, phrase penalty, and alignment
	# features phrase-penalty e2fphrase f2ephrase lexprob lexical abstract adjacent x-rule source-terminals-without-target target-terminals-without-source monotonic glue-rule rarity target-word-count unaligned-count
	features e_given_f_phrase f_given_e_phrase e_given_f_lex f_given_e_lex rarity phrase-penalty alignment

	# joshua is the only option and the default; later we will want to add
	# formats for other decoders such as moses and cdec, if they use other formats
	output-format joshua

	# label feature scores? each score will be output as name=score
	label-feature-scores false

	amazon-work s3://edu.jhu.cs.jonny/wmt11/fr-en/hiero
	amazon-jar s3://edu.jhu.cs.jonny/thrax.jar
	amazon-num-instances 15

	max-split-size 8388608

	# the format should be:
	# foreign sentence ||| english sentence ||| alignment
	# where the english is either parsed or not depending on whether you want
	# SAMT or you want Hiero.
	#input-file s3://edu.jhu.cs.jonny/wmt11/corpus.fr-en