# blob: 6f88ec8b0344676bfcdab5107d3ed79a551cfe6d [file] [log] [blame]
# This file is a template for the Joshua pipeline; variables enclosed
# in <angle-brackets> are substituted by the pipeline script as
# appropriate. This file also serves to document Joshua's many
# parameters.
# These are the grammar file specifications. Joshua supports an
# arbitrary number of grammar files, each specified on its own line
# using the following format:
#
#   tm = TYPE OWNER LIMIT FILE
#
# TYPE is "packed", "thrax", or "samt". The last of these denotes the
# format used in Zollmann and Venugopal's SAMT decoder.
# OWNER is the "owner" of the rules in the grammar; this is used to
# determine which set of phrasal features apply to the grammar's
# rules. Having different owners allows different features to be
# applied to different grammars, and for grammars to share features
# across files.
# LIMIT is the maximum input span permitted for the application of
# grammar rules found in the grammar file. A value of -1 implies no limit.
# FILE is the grammar file (or directory when using packed grammars).
# The file can be compressed with gzip, which is determined by the
# presence or absence of a ".gz" file extension.
# By a convention defined by Chiang (2007), the grammars are split
# into two files: the main translation grammar containing all the
# learned translation rules, and a glue grammar which supports
# monotonic concatenation of hierarchical phrases. The glue grammar's
# main distinction from the regular grammar is that the span limit
# does not apply to it.
# Main translation grammar: type "thrax", owner "pt", span limit 20,
# read from "grammar".
tm = thrax pt 20 grammar
# Glue grammar: type "thrax", owner "glue", no span limit (-1),
# read from "glue".
tm = thrax glue -1 glue
# This is the nonterminal symbol used to cover unknown words in the
# source language.
default-non-terminal = OOV
# This is the goal nonterminal, used to determine when a complete
# parse is found. It should correspond to the root-level rules in the
# glue grammar.
goal-symbol = GOAL
# Language model config.
# Multiple language models are supported. For each language model,
# create a line in the following format,
# lm = TYPE 5 false false 100 FILE
# where the six fields correspond to the following values:
# - LM type: one of "kenlm", "berkeleylm", "javalm" (not recommended), or "none"
# - LM order: the N of the N-gram language model
# - whether to use left equivalent state (currently not supported)
# - whether to use right equivalent state (currently not supported)
# - the ceiling cost of any n-gram (currently ignored)
# - LM file: the location of the language model file
# You also need to add a weight for each language model below.
# If this is set to true, the suffix _OOV is appended to unknown
# source-language words.
mark-oovs = false
# The pop limit for decoding. This determines how many hypotheses are
# considered over each span of the input.
pop-limit = 100
# How many hypotheses to output.
top-n = 300
# Whether those output hypotheses should be distinct strings.
use-unique-nbest = true
# This is the default format of the output printed to STDOUT. The
# variables that can be substituted are:
#   %i: the sentence number (0-indexed)
#   %s: the translated sentence
#   %t: the derivation tree
#   %f: the feature string
#   %c: the model cost
output-format = %i ||| %s ||| %f ||| %c
# When printing the trees (%t in 'output-format'), this controls
# whether the alignments are also printed.
# NOTE(review): the output-format above does not include %t, so this
# setting presumably has no visible effect unless %t is added -- confirm.
include-align-index = false
## Feature functions and weights.
# These are the feature functions to activate, one per line.
feature_function = OOVPenalty
feature_function = WordPenalty
# These are the feature weights, one per line, written as NAME WEIGHT
# (separated by whitespace, with no "=").
# NOTE(review): tm_pt_0 and tm_glue_0 presumably refer to feature 0 of
# the grammars owned by "pt" and "glue" in the tm lines -- confirm.
# NOTE(review): the language model section asks for one weight per
# language model; add it here when an lm line is configured.
tm_pt_0 0
tm_glue_0 0
OOVPenalty 1
WordPenalty -1