blob: 953d9c36559d48275b30cd113f23b45f4b3e06af [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# this is an example Thrax configuration file
# <- this symbol indicates a comment
# each line should be a key-value pair separated by whitespace
###
### GRAMMAR OPTIONS
###
grammar hiero # or samt
reverse false
source-is-parsed false
target-is-parsed false
# default-nt X # X is the default anyway
min-rule-count 1
# the number of reducers
reducers 16
# not only do these next six options have the suggested values as given
# in Chiang's "Hierarchical Phrase-based Translation" (CL), they are also
# Thrax's default values! You could comment them out and the resulting grammar
# would be identical.
# maximum length of initial phrase pairs
initial-phrase-length 10
lex-source-words 10
lex-target-words 10
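# maximum number of NTs in a rule
# NOTE(review): 0 permits no nonterminals at all (phrase-only rules); Chiang's
# paper limits rules to two nonterminals -- confirm that 0 is intended here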
arity 0
# minimum number of aligned terminals in a rule
lexicality 1
# allow adjacent nonterminals on source side
adjacent-nts false
# allow unaligned words at boundaries of phrases
loose false
allow-abstract-rules false
allow-nonlexical-x false
allow-full-sentence-rules false
nonlex-source-length 5
nonlex-target-length 5
nonlex-source-words 5
nonlex-target-words 5
allow-double-plus false
rule-span-limit 12
phrase-penalty 2.718
# a whitespace-separated list of features
# in this example, the features are the phrase translation probabilities and
# lexical probabilities (in both directions), rarity, phrase penalty, and count
# features phrase-penalty e2fphrase f2ephrase lexprob lexical abstract adjacent x-rule source-terminals-without-target target-terminals-without-source monotonic glue-rule rarity target-word-count unaligned-count
features e_given_f_phrase f_given_e_phrase e_given_f_lex f_given_e_lex rarity phrase-penalty count
# joshua is currently the only output format, and it is the default
# later we will want to add formats for other decoders
# such as moses and cdec, if they use other formats
output-format joshua
# label feature scores?
# each score will be output as name=score
label-feature-scores false
amazon-work s3://edu.jhu.cs.jonny/wmt11/fr-en/hiero
amazon-jar s3://edu.jhu.cs.jonny/thrax.jar
amazon-num-instances 15
# the format of the input file (see input-file below) should be:
# foreign sentence ||| english sentence ||| alignment
# where the english is either parsed or not depending on whether you want
# SAMT or you want Hiero.
max-split-size 8388608
#input-file s3://edu.jhu.cs.jonny/wmt11/corpus.fr-en