blob: 997bee8828620c321613a96831f9f0b9a89653c7 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# this is an example Thrax configuration file
# <- this symbol indicates a comment
# each line should be a key-value pair separated by whitespace
###
### GRAMMAR OPTIONS
###
grammar hiero # or samt
reverse false
source-is-parsed false
target-is-parsed false
# default-nt X # X is the default anyway
min-rule-count 1
# the number of reducers
reducers 16
# Maximum length of initial phrase pairs. These are set to be shorter than
# those used by Hiero.
initial-phrase-length <MAXPHRLEN>
lex-source-words 5
lex-target-words 5
# maximum number of NTs in a rule
arity 0
# minimum number of aligned terminals in a rule
lexicality 1
# allow adjacent nonterminals on source side
adjacent-nts false
# allow unaligned words at boundaries of phrases
loose true
allow-abstract-rules false
allow-nonlexical-x false
allow-full-sentence-rules false
nonlex-source-length 5
nonlex-target-length 5
nonlex-source-words 5
nonlex-target-words 5
allow-double-plus false
rule-span-limit 12
phrase-penalty 2.718
# a whitespace-separated list of features
# in this example, the features are the phrase translation and lexical
# probabilities in both directions, rarity, phrase penalty, alignment, and count
# features phrase-penalty e2fphrase f2ephrase lexprob lexical abstract adjacent x-rule source-terminals-without-target target-terminals-without-source monotonic glue-rule rarity target-word-count unaligned-count
features e_given_f_phrase f_given_e_phrase e_given_f_lex f_given_e_lex rarity phrase-penalty alignment count
# currently the only option (and the default); later we will want to add
# formats for other decoders such as moses and cdec, if they use other formats
output-format joshua
# label feature scores? each score will be output as name=score
label-feature-scores false
amazon-work s3://edu.jhu.cs.jonny/wmt11/fr-en/hiero
amazon-jar s3://edu.jhu.cs.jonny/thrax.jar
amazon-num-instances 15
max-split-size 8388608
# the input file format should be:
# foreign sentence ||| english sentence ||| alignment
# where the English side is either parsed or not, depending on whether you
# want SAMT or Hiero.
#input-file s3://edu.jhu.cs.jonny/wmt11/corpus.fr-en