# test/decoder/fragmentlm/joshua.config - joshua - Git at Google

 # This file is a template for the Joshua pipeline; variables enclosed
 # in <angle-brackets> are substituted by the pipeline script as
 # appropriate.  This file also serves to document Joshua's many
 # parameters.

 # These are the grammar file specifications.  Joshua supports an
 # arbitrary number of grammar files, each specified on its own line
 # using the following format:
 #
 #   tm = TYPE OWNER LIMIT FILE
 #
 # TYPE is "packed", "thrax", or "samt".  The last of these denotes the format
 # used in Zollmann and Venugopal's SAMT decoder
 # (http://www.cs.cmu.edu/~zollmann/samt/).
 #
 # OWNER is the "owner" of the rules in the grammar; this is used to
 # determine which set of phrasal features apply to the grammar's
 # rules.  Having different owners allows different features to be
 # applied to different grammars, and for grammars to share features
 # across files.
 #
 # LIMIT is the maximum input span permitted for the application of
 # grammar rules found in the grammar file.  A value of -1 implies no limit.
 #
 # FILE is the grammar file (or directory when using packed grammars).
 # The file can be compressed with gzip, which is determined by the
 # presence or absence of a ".gz" file extension.
 #
 # By a convention defined by Chiang (2007), the grammars are split
 # into two files: the main translation grammar containing all the
 # learned translation rules, and a glue grammar which supports
 # monotonic concatenation of hierarchical phrases. The glue grammar's
 # main distinction from the regular grammar is that the span limit
 # does not apply to it.

 tm = thrax pt 20 grammar
 tm = thrax glue -1 glue

 # This symbol is used over unknown words in the source language

 default-non-terminal = OOV

 # This is the goal nonterminal, used to determine when a complete
 # parse is found.  It should correspond to the root-level rules in the
 # glue grammar.

 goal-symbol = GOAL

 # Language model config.

 # Multiple language models are supported.  For each language model,
 # create a line in the following format,
 #
 # lm = TYPE 5 false false 100 FILE
 #
 # where the six fields correspond to the following values:
 # - LM type: one of "kenlm", "berkeleylm", "javalm" (not recommended), or "none"
 # - LM order: the N of the N-gram language model
 # - whether to use left equivalent state (currently not supported)
 # - whether to use right equivalent state (currently not supported)
 # - the ceiling cost of any n-gram (currently ignored)
 # - LM file: the location of the language model file
 # You also need to add a weight for each language model below.

 # The suffix _OOV is appended to unknown source-language words if this
 # is set to true.

 mark-oovs = false

 # The pop-limit for decoding.  This determines how many hypotheses are
 # considered over each span of the input.

 pop-limit = 100

 # How many hypotheses to output

 top-n = 300

 # Whether those hypotheses should be distinct strings

 use-unique-nbest = true

 # This is the default format of the output printed to STDOUT.  The variables that can be
 # substituted are:
 #
 # %i: the sentence number (0-indexed)
 # %s: the translated sentence
 # %t: the derivation tree
 # %f: the feature string
 # %c: the model cost

 output-format = %i ||| %s ||| %f ||| %c

 # When printing the trees (%t in 'output-format'), this controls whether the alignments
 # are also printed.

 include-align-index = false

 ## Feature functions and weights.

 # These are the feature functions to activate.
 feature_function = OOVPenalty
 feature_function = WordPenalty

 # Feature weights, one per line, written as NAME WEIGHT (no "=").
 # NOTE(review): tm_pt_0 and tm_glue_0 presumably name feature 0 of the
 # grammars owned by "pt" and "glue" in the tm lines -- confirm.
 tm_pt_0 0
 tm_glue_0 0

 # Weights for the two feature functions activated above.
 OOVPenalty 1
 WordPenalty -1
	# This file is a template for the Joshua pipeline; variables enclosed
	# in <angle-brackets> are substituted by the pipeline script as
	# appropriate. This file also serves to document Joshua's many
	# parameters.

	# These are the grammar file specifications. Joshua supports an
	# arbitrary number of grammar files, each specified on its own line
	# using the following format:
	#
	# tm = TYPE OWNER LIMIT FILE
	#
	# TYPE is "packed", "thrax", or "samt". The last of these denotes the format
	# used in Zollmann and Venugopal's SAMT decoder
	# (http://www.cs.cmu.edu/~zollmann/samt/).
	#
	# OWNER is the "owner" of the rules in the grammar; this is used to
	# determine which set of phrasal features apply to the grammar's
	# rules. Having different owners allows different features to be
	# applied to different grammars, and for grammars to share features
	# across files.
	#
	# LIMIT is the maximum input span permitted for the application of
	# grammar rules found in the grammar file. A value of -1 implies no limit.
	#
	# FILE is the grammar file (or directory when using packed grammars).
	# The file can be compressed with gzip, which is determined by the
	# presence or absence of a ".gz" file extension.
	#
	# By a convention defined by Chiang (2007), the grammars are split
	# into two files: the main translation grammar containing all the
	# learned translation rules, and a glue grammar which supports
	# monotonic concatenation of hierarchical phrases. The glue grammar's
	# main distinction from the regular grammar is that the span limit
	# does not apply to it.

	tm = thrax pt 20 grammar
	tm = thrax glue -1 glue

	# This symbol is used over unknown words in the source language

	default-non-terminal = OOV

	# This is the goal nonterminal, used to determine when a complete
	# parse is found. It should correspond to the root-level rules in the
	# glue grammar.

	goal-symbol = GOAL

	# Language model config.

	# Multiple language models are supported. For each language model,
	# create a line in the following format,
	#
	# lm = TYPE 5 false false 100 FILE
	#
	# where the six fields correspond to the following values:
	# - LM type: one of "kenlm", "berkeleylm", "javalm" (not recommended), or "none"
	# - LM order: the N of the N-gram language model
	# - whether to use left equivalent state (currently not supported)
	# - whether to use right equivalent state (currently not supported)
	# - the ceiling cost of any n-gram (currently ignored)
	# - LM file: the location of the language model file
	# You also need to add a weight for each language model below.

	# The suffix _OOV is appended to unknown source-language words if this
	# is set to true.

	mark-oovs = false

	# The pop-limit for decoding. This determines how many hypotheses are
	# considered over each span of the input.

	pop-limit = 100

	# How many hypotheses to output

	top-n = 300

	# Whether those hypotheses should be distinct strings

	use-unique-nbest = true

	# This is the default format of the output printed to STDOUT. The variables that can be
	# substituted are:
	#
	# %i: the sentence number (0-indexed)
	# %s: the translated sentence
	# %t: the derivation tree
	# %f: the feature string
	# %c: the model cost
	#
	# Fields are separated by a literal " ||| " (plain pipes, no backslashes).

	output-format = %i ||| %s ||| %f ||| %c

	# When printing the trees (%t in 'output-format'), this controls whether the alignments
	# are also printed.

	include-align-index = false

	## Feature functions and weights.

	# These are the feature functions to activate.
	feature_function = OOVPenalty
	feature_function = WordPenalty

	# Feature weights, one per line, written as NAME WEIGHT (no "=").
	# NOTE(review): tm_pt_0 and tm_glue_0 presumably name feature 0 of the
	# grammars owned by "pt" and "glue" in the tm lines -- confirm.
	tm_pt_0 0
	tm_glue_0 0

	# Weights for the two feature functions activated above.
	OOVPenalty 1
	WordPenalty -1