scripts/training/templates/thrax-phrasal.conf - joshua - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 # this is an example Thrax configuration file
 # <- this symbol indicates a comment
 # each line should be a key-value pair separated by whitespace

 ###
 ### GRAMMAR OPTIONS
 ###

 grammar     hiero   # or samt
 reverse     false
 source-is-parsed    false
 target-is-parsed    false
 # default-nt    X   # X is the default anyway

 min-rule-count 1

 # the number of reducers
 reducers 16

 # not only do these next six options have the suggested values as given
 # in Chiang's "Hierarchical Phrase-based Translation" (CL), they are also
 # Thrax's default values! You could comment them out and the resulting grammar
 # would be identical.

 # maximum length of initial phrase pairs
 initial-phrase-length   10
 lex-source-words        10
 lex-target-words        10

 # maximum number of NTs in a rule
 arity                   0

 # minimum number of aligned terminals in a rule
 lexicality              1

 # allow adjacent nonterminals on source side
 adjacent-nts    false

 # allow unaligned words at boundaries of phrases
 loose           false

 allow-abstract-rules    false
 allow-nonlexical-x      false
 allow-full-sentence-rules   false

 nonlex-source-length    5
 nonlex-target-length    5
 nonlex-source-words     5
 nonlex-target-words     5

 allow-double-plus    false

 rule-span-limit         12

 phrase-penalty  2.718

 # a whitespace-separated list of features
 # in this example, the features are the phrase translation probabilities and
 # lexical probabilities (in both directions), rarity, phrase penalty, and count
 # features        phrase-penalty e2fphrase f2ephrase lexprob lexical abstract adjacent x-rule source-terminals-without-target target-terminals-without-source monotonic glue-rule rarity target-word-count unaligned-count

 features        e_given_f_phrase f_given_e_phrase e_given_f_lex f_given_e_lex rarity phrase-penalty count

 # the only option and default
 # later we will want to add formats for other decoders
 # such as moses and cdec, if they use other formats
 output-format   joshua


 # label feature scores?
 # each score will be output as name=score
 label-feature-scores false

 amazon-work s3://edu.jhu.cs.jonny/wmt11/fr-en/hiero
 amazon-jar  s3://edu.jhu.cs.jonny/thrax.jar
 amazon-num-instances    15

 # the format should be:
 # foreign sentence ||| english sentence ||| alignment
 # where the english is either parsed or not depending on whether you want
 # SAMT or you want Hiero.

 max-split-size  8388608
 #input-file  s3://edu.jhu.cs.jonny/wmt11/corpus.fr-en
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	# this is an example Thrax configuration file
	# <- this symbol indicates a comment
	# each line should be a key-value pair separated by whitespace

	###
	### GRAMMAR OPTIONS
	###

	grammar hiero # or samt
	reverse false
	source-is-parsed false
	target-is-parsed false
	# default-nt X # X is the default anyway

	min-rule-count 1

	# the number of reducers
	reducers 16

	# not only do these next six options have the suggested values as given
	# in Chiang's "Hierarchical Phrase-based Translation" (CL), they are also
	# Thrax's default values! You could comment them out and the resulting grammar
	# would be identical.

	# maximum length of initial phrase pairs
	initial-phrase-length 10
	lex-source-words 10
	lex-target-words 10

	# maximum number of NTs in a rule
	arity 0

	# minimum number of aligned terminals in a rule
	lexicality 1

	# allow adjacent nonterminals on source side
	adjacent-nts false

	# allow unaligned words at boundaries of phrases
	loose false

	allow-abstract-rules false
	allow-nonlexical-x false
	allow-full-sentence-rules false

	nonlex-source-length 5
	nonlex-target-length 5
	nonlex-source-words 5
	nonlex-target-words 5

	allow-double-plus false

	rule-span-limit 12

	phrase-penalty 2.718

	# a whitespace-separated list of features
	# in this example, the features are the phrase translation probabilities and
	# lexical probabilities (in both directions), rarity, phrase penalty, and count
	# features phrase-penalty e2fphrase f2ephrase lexprob lexical abstract adjacent x-rule source-terminals-without-target target-terminals-without-source monotonic glue-rule rarity target-word-count unaligned-count

	features e_given_f_phrase f_given_e_phrase e_given_f_lex f_given_e_lex rarity phrase-penalty count

	# the only option and default
	# later we will want to add formats for other decoders
	# such as moses and cdec, if they use other formats
	output-format joshua


	# label feature scores?
	# each score will be output as name=score
	label-feature-scores false

	amazon-work s3://edu.jhu.cs.jonny/wmt11/fr-en/hiero
	amazon-jar s3://edu.jhu.cs.jonny/thrax.jar
	amazon-num-instances 15

	# the format should be:
	# foreign sentence ||| english sentence ||| alignment
	# where the english is either parsed or not depending on whether you want
	# SAMT or you want Hiero.

	max-split-size 8388608
	#input-file s3://edu.jhu.cs.jonny/wmt11/corpus.fr-en