blob: 630c76fd7b9fe21e108b28f13981689a3943f2c9 [file] [log] [blame]
# this is an example Thrax configuration file
# <- this symbol indicates a comment
# each line should be a key-value pair separated by whitespace
###
### GRAMMAR OPTIONS
###
grammar hiero # or samt
reverse false
source-is-parsed false
target-is-parsed false
# default-nt X # X is the default anyway
min-rule-count 1
# the number of reducers
reducers 16
# not only do these next six options have the suggested values as given
# in Chiang's "Hierarchical Phrase-based Translation" (CL), they are also
# Thrax's default values! You could comment them out and the resulting grammar
# would be identical.
# maximum length of initial phrase pairs
initial-phrase-length 10
lex-source-words 10
lex-target-words 10
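# maximum number of NTs in a rule
# NOTE(review): Chiang's paper limits rules to 2 nonterminals; a value of 0
# permits no nonterminals at all (purely lexical rules) -- confirm this is intended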
arity 0
# minimum number of aligned terminals in a rule
lexicality 1
# allow adjacent nonterminals on source side
adjacent-nts false
# allow unaligned words at boundaries of phrases
loose true
allow-abstract-rules false
allow-nonlexical-x false
allow-full-sentence-rules false
nonlex-source-length 5
nonlex-target-length 5
nonlex-source-words 5
nonlex-target-words 5
allow-double-plus false
rule-span-limit 12
phrase-penalty 2.718
# a whitespace-separated list of features
# in this example, the features are the phrase translation probabilities and
# lexical probabilities (each in both directions), rarity, phrase penalty,
# and alignment
# features phrase-penalty e2fphrase f2ephrase lexprob lexical abstract adjacent x-rule source-terminals-without-target target-terminals-without-source monotonic glue-rule rarity target-word-count unaligned-count
features e_given_f_phrase f_given_e_phrase e_given_f_lex f_given_e_lex rarity phrase-penalty alignment
# joshua is the only option and also the default; later we will want to add
# formats for other decoders such as moses and cdec, if they use other formats
output-format joshua
# label feature scores? each score will be output as name=score
label-feature-scores false
amazon-work s3://edu.jhu.cs.jonny/wmt11/fr-en/hiero
amazon-jar s3://edu.jhu.cs.jonny/thrax.jar
amazon-num-instances 15
max-split-size 8388608
# the format should be:
# foreign sentence ||| english sentence ||| alignment
# where the english is either parsed or not depending on whether you want
# SAMT or you want Hiero.
#input-file s3://edu.jhu.cs.jonny/wmt11/corpus.fr-en