scripts/training/templates/thrax-phrase.conf - joshua - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 # this is an example Thrax configuration file
 # <- this symbol indicates a comment
 # each line should be a key-value pair separated by whitespace

 ###
 ### GRAMMAR OPTIONS
 ###

 grammar     hiero   # or samt
 reverse     false
 source-is-parsed    false
 target-is-parsed    false
 # default-nt    X   # X is the default anyway

 min-rule-count 1

 # the number of reducers
 reducers 16

 # Maximum length of initial phrase pairs. These limits are set to be shorter
 # than those used by Hiero.
 initial-phrase-length   <MAXPHRLEN>
 lex-source-words        5
 lex-target-words        5

 # maximum number of NTs in a rule
 arity                   0

 # minimum number of aligned terminals in a rule
 lexicality              1

 # allow adjacent nonterminals on source side
 adjacent-nts    false

 # allow unaligned words at boundaries of phrases
 loose           true

 allow-abstract-rules    false
 allow-nonlexical-x      false
 allow-full-sentence-rules   false

 nonlex-source-length    5
 nonlex-target-length    5
 nonlex-source-words     5
 nonlex-target-words     5

 allow-double-plus    false

 rule-span-limit         12

 phrase-penalty  2.718

 # a whitespace-separated list of features
 # in this example, the features are the phrase translation probabilities and
 # lexical probabilities (in both directions), rarity, phrase penalty,
 # alignment, and count
 # features        phrase-penalty e2fphrase f2ephrase lexprob lexical abstract adjacent x-rule source-terminals-without-target target-terminals-without-source monotonic glue-rule rarity target-word-count unaligned-count
 features        e_given_f_phrase f_given_e_phrase e_given_f_lex f_given_e_lex rarity phrase-penalty alignment count

 # joshua is currently the only option (and the default); later we will want to
 # add formats for other decoders such as moses and cdec, if they use other formats
 output-format   joshua

 # label feature scores? each score will be output as name=score
 label-feature-scores false

 amazon-work s3://edu.jhu.cs.jonny/wmt11/fr-en/hiero
 amazon-jar  s3://edu.jhu.cs.jonny/thrax.jar
 amazon-num-instances    15

 max-split-size  8388608

 # the format of the input file should be:
 # foreign sentence ||| english sentence ||| alignment
 # where the English side is either parsed or not, depending on whether you
 # want SAMT or Hiero.
 #input-file  s3://edu.jhu.cs.jonny/wmt11/corpus.fr-en
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	# this is an example Thrax configuration file
	# <- this symbol indicates a comment
	# each line should be a key-value pair separated by whitespace

	###
	### GRAMMAR OPTIONS
	###

	grammar hiero # or samt
	reverse false
	source-is-parsed false
	target-is-parsed false
	# default-nt X # X is the default anyway

	min-rule-count 1

	# the number of reducers
	reducers 16

	# Maximum length of initial phrase pairs. These limits are set to be shorter
	# than those used by Hiero.
	initial-phrase-length <MAXPHRLEN>
	lex-source-words 5
	lex-target-words 5

	# maximum number of NTs in a rule
	arity 0

	# minimum number of aligned terminals in a rule
	lexicality 1

	# allow adjacent nonterminals on source side
	adjacent-nts false

	# allow unaligned words at boundaries of phrases
	loose true

	allow-abstract-rules false
	allow-nonlexical-x false
	allow-full-sentence-rules false

	nonlex-source-length 5
	nonlex-target-length 5
	nonlex-source-words 5
	nonlex-target-words 5

	allow-double-plus false

	rule-span-limit 12

	phrase-penalty 2.718

	# a whitespace-separated list of features
	# in this example, the features are the phrase translation probabilities and
	# lexical probabilities (in both directions), rarity, phrase penalty,
	# alignment, and count
	# features phrase-penalty e2fphrase f2ephrase lexprob lexical abstract adjacent x-rule source-terminals-without-target target-terminals-without-source monotonic glue-rule rarity target-word-count unaligned-count
	features e_given_f_phrase f_given_e_phrase e_given_f_lex f_given_e_lex rarity phrase-penalty alignment count

	# joshua is currently the only option (and the default); later we will want to
	# add formats for other decoders such as moses and cdec, if they use other formats
	output-format joshua

	# label feature scores? each score will be output as name=score
	label-feature-scores false

	amazon-work s3://edu.jhu.cs.jonny/wmt11/fr-en/hiero
	amazon-jar s3://edu.jhu.cs.jonny/thrax.jar
	amazon-num-instances 15

	max-split-size 8388608

	# the format of the input file should be:
	# foreign sentence ||| english sentence ||| alignment
	# where the English side is either parsed or not, depending on whether you
	# want SAMT or Hiero.
	#input-file s3://edu.jhu.cs.jonny/wmt11/corpus.fr-en