| # this is an example Thrax configuration file |
| # <- this symbol indicates a comment |
| # each line should be a key-value pair separated by whitespace |
| |
| ### |
| ### GRAMMAR OPTIONS |
| ### |
| |
| grammar hiero # or samt |
| reverse false |
| source-is-parsed false |
| target-is-parsed false |
| # default-nt X # X is the default anyway |
| |
| min-rule-count 1 |
| |
| # the number of reducers |
| reducers 16 |
| |
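| # not only do these next six options have the suggested values as given |
| # in Chiang's "Hierarchical Phrase-based Translation" (CL), they are also |
| # Thrax's default values! You could comment them out and the resulting grammar |
| # would be identical. |
| # NOTE(review): "arity 0" below does not match the limit of two nonterminals |
| # per rule used in that paper, so the claim above looks stale for this file; |
| # confirm "arity" and "loose" against Thrax's defaults before commenting |
| # either of them out. |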
| |
| # maximum length of initial phrase pairs |
| initial-phrase-length 10 |
| lex-source-words 10 |
| lex-target-words 10 |
| |
| # maximum number of NTs in a rule |
| arity 0 |
| |
| # minimum number of aligned terminals in a rule |
| lexicality 1 |
| |
| # allow adjacent nonterminals on source side |
| adjacent-nts false |
| |
| # allow unaligned words at boundaries of phrases |
| loose true |
| |
| allow-abstract-rules false |
| allow-nonlexical-x false |
| allow-full-sentence-rules false |
| |
| nonlex-source-length 5 |
| nonlex-target-length 5 |
| nonlex-source-words 5 |
| nonlex-target-words 5 |
| |
| allow-double-plus false |
| |
| rule-span-limit 12 |
| |
| phrase-penalty 2.718 |
| |
| # a whitespace-separated list of features |
| # in this example, the features are the phrase translation probabilities and |
| # lexical probabilities (in both directions), rarity, phrase penalty, and |
| # alignment |
| # features phrase-penalty e2fphrase f2ephrase lexprob lexical abstract adjacent x-rule source-terminals-without-target target-terminals-without-source monotonic glue-rule rarity target-word-count unaligned-count |
| features e_given_f_phrase f_given_e_phrase e_given_f_lex f_given_e_lex rarity phrase-penalty alignment |
| |
| # "joshua" is currently the only option, and it is the default. later we will |
| # want to add formats for other decoders, such as Moses and cdec, if they use |
| # other formats |
| output-format joshua |
| |
| # label feature scores? each score will be output as name=score |
| label-feature-scores false |
| |
| amazon-work s3://edu.jhu.cs.jonny/wmt11/fr-en/hiero |
| amazon-jar s3://edu.jhu.cs.jonny/thrax.jar |
| amazon-num-instances 15 |
| |
| max-split-size 8388608 |
| |
| # the format should be: |
| # foreign sentence ||| english sentence ||| alignment |
| # where the english is either parsed or not depending on whether you want |
| # SAMT or you want Hiero. |
| #input-file s3://edu.jhu.cs.jonny/wmt11/corpus.fr-en |