| # This file is a template for the Joshua pipeline; variables enclosed |
| # in <angle-brackets> are substituted by the pipeline script as |
| # appropriate. This file also serves to document Joshua's many |
| # parameters. |
| |
| # These are the grammar file specifications. Joshua supports an |
| # arbitrary number of grammar files, each specified on its own line |
| # using the following format: |
| # |
| # tm = TYPE OWNER LIMIT FILE |
| # |
| # TYPE is "packed", "thrax", or "samt". The last of these denotes the |
| # format used in Zollmann and Venugopal's SAMT decoder |
| # (http://www.cs.cmu.edu/~zollmann/samt/). |
| # |
| # OWNER is the "owner" of the rules in the grammar; this is used to |
| # determine which set of phrasal features apply to the grammar's |
| # rules. Having different owners allows different features to be |
| # applied to different grammars, and for grammars to share features |
| # across files. |
| # |
| # LIMIT is the maximum input span permitted for the application of |
| # grammar rules found in the grammar file. A value of -1 implies no limit. |
| # |
| # FILE is the grammar file (or directory when using packed grammars). |
| # The file can be compressed with gzip, which is determined by the |
| # presence or absence of a ".gz" file extension. |
| # |
| # By a convention defined by Chiang (2007), the grammars are split |
| # into two files: the main translation grammar containing all the |
| # learned translation rules, and a glue grammar which supports |
| # monotonic concatenation of hierarchical phrases. The glue grammar's |
| # main distinction from the regular grammar is that the span limit |
| # does not apply to it. |
| |
| tm = thrax pt 20 grammar |
| tm = thrax glue -1 glue |
| |
| # This is the nonterminal symbol assigned to unknown |
| # (out-of-vocabulary) words in the source language. |
| |
| default-non-terminal = OOV |
| |
| # This is the goal nonterminal, used to determine when a complete |
| # parse is found. It should correspond to the root-level rules in the |
| # glue grammar. |
| |
| goal-symbol = GOAL |
| |
| # Language model config. |
| |
| # Multiple language models are supported. For each language model, |
| # create a line in the following format, |
| # |
| # lm = TYPE 5 false false 100 FILE |
| # |
| # where the six fields correspond to the following values: |
| # - LM type: one of "kenlm", "berkeleylm", "javalm" (not recommended), or "none" |
| # - LM order: the N of the N-gram language model |
| # - whether to use left equivalent state (currently not supported) |
| # - whether to use right equivalent state (currently not supported) |
| # - the ceiling cost of any n-gram (currently ignored) |
| # - LM file: the location of the language model file |
| # You also need to add a weight for each language model in the |
| # feature weights section at the end of this file. |
| |
| # The suffix _OOV is appended to unknown source-language words if this |
| # is set to true. |
| |
| mark-oovs = false |
| |
| # The pop-limit for decoding. This determines how many hypotheses are |
| # considered over each span of the input. |
| |
| pop-limit = 100 |
| |
| # How many hypotheses to output |
| |
| top-n = 300 |
| |
| # Whether those hypotheses should be distinct strings |
| |
| use-unique-nbest = true |
| |
| # This is the default format of the output printed to STDOUT. The variables that can be |
| # substituted are: |
| # |
| # %i: the sentence number (0-indexed) |
| # %s: the translated sentence |
| # %t: the derivation tree |
| # %f: the feature string |
| # %c: the model cost |
| |
| output-format = %i ||| %s ||| %f ||| %c |
| |
| # When printing the trees (%t in 'output-format'), this controls whether the alignments |
| # are also printed. |
| |
| include-align-index = false |
| |
| ## Feature functions and weights. |
| |
| # These are the feature functions to activate. The weight of each |
| # feature is given further below, one "NAME WEIGHT" pair per line. |
| feature_function = OOVPenalty |
| feature_function = WordPenalty |
| |
| tm_pt_0 0 |
| tm_glue_0 0 |
| |
| OOVPenalty 1 |
| WordPenalty -1 |