| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| |
| ################################################ |
| ### CONFIGURATION FILE FOR AN SMT EXPERIMENT ### |
| ################################################ |
| |
| [GENERAL] |
| |
| ### directory in which experiment is run |
| # |
| working-dir = /home/pkoehn/experiment |
| |
| # specification of the language pair |
| input-extension = fr |
| output-extension = en |
| pair-extension = fr-en |
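#
# note (illustrative, based on how the corpus stems below are used): the
# extensions are appended to each raw-stem, so a raw-stem of
# .../europarl-v7.fr-en is expected to exist on disk as
# .../europarl-v7.fr-en.fr and .../europarl-v7.fr-en.en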
| |
| ### directories that contain tools and data |
| # |
| # moses |
| moses-src-dir = /home/pkoehn/moses |
| # |
| # moses binaries |
| moses-bin-dir = $moses-src-dir/bin |
| # |
| # moses scripts |
| moses-script-dir = $moses-src-dir/scripts |
| # |
# directory where the GIZA++/MGIZA programs reside
| external-bin-dir = /Users/hieuhoang/workspace/bin/training-tools |
| # |
| # srilm |
| srilm-dir = $moses-src-dir/srilm/bin/i686 |
| # |
| # irstlm |
| irstlm-dir = $moses-src-dir/irstlm/bin |
| # |
| # randlm |
| randlm-dir = $moses-src-dir/randlm/bin |
| # |
| # data |
| wmt12-data = $working-dir/data |
| |
| ### basic tools |
| # |
| # moses decoder |
| decoder = $moses-bin-dir/moses_chart |
| |
| # conversion of phrase table into binary on-disk format |
| #ttable-binarizer = $moses-bin-dir/processPhraseTable |
| |
| # conversion of rule table into binary on-disk format |
| ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 4 100 2" |
| |
| # tokenizers - comment out if all your data is already tokenized |
| input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension" |
| output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-extension" |
| |
| # truecasers - comment out if you do not use the truecaser |
| input-truecaser = $moses-script-dir/recaser/truecase.perl |
| output-truecaser = $moses-script-dir/recaser/truecase.perl |
| detruecaser = $moses-script-dir/recaser/detruecase.perl |
| |
| ### generic parallelizer for cluster and multi-core machines |
# you may specify a script that allows the parallel execution of
# parallelizable steps (see meta file). you also need to specify
# the number of jobs (cluster) or cores (multi-core)
| # |
| #generic-parallelizer = $moses-script-dir/ems/support/generic-parallelizer.perl |
| #generic-parallelizer = $moses-script-dir/ems/support/generic-multicore-parallelizer.perl |
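#
# for example, on a single multi-core machine one would typically enable the
# multi-core variant (illustrative sketch; the number of cores is taken from
# the "cores" setting in the multi-core settings further below):
#generic-parallelizer = $moses-script-dir/ems/support/generic-multicore-parallelizer.perl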
| |
| ### cluster settings (if run on a cluster machine) |
| # number of jobs to be submitted in parallel |
| # |
| #jobs = 10 |
| |
| # arguments to qsub when scheduling a job |
| #qsub-settings = "" |
| |
# project for privileges and usage accounting
| #qsub-project = iccs_smt |
| |
| # memory and time |
| #qsub-memory = 4 |
| #qsub-hours = 48 |
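#
# illustrative sketch only: the exact qsub flags depend on your grid engine
# setup, e.g. submitting 10 parallel jobs to a specific queue
#jobs = 10
#qsub-settings = "-q long.q"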
| |
| ### multi-core settings |
# when the generic parallelizer is used, the number of cores
# is specified here
| cores = 8 |
| |
| ################################################################# |
| # PARALLEL CORPUS PREPARATION: |
| # create a tokenized, sentence-aligned corpus, ready for training |
| |
| [CORPUS] |
| |
| ### long sentences are filtered out, since they slow down GIZA++ |
| # and are a less reliable source of data. set here the maximum |
| # length of a sentence |
| # |
| max-sentence-length = 80 |
| |
| [CORPUS:europarl] IGNORE |
| |
| ### command to run to get raw corpus files |
| # |
| # get-corpus-script = |
| |
| ### raw corpus files (untokenized, but sentence aligned) |
| # |
| raw-stem = $wmt12-data/training/europarl-v7.$pair-extension |
| |
| ### tokenized corpus files (may contain long sentences) |
| # |
| #tokenized-stem = |
| |
| ### if sentence filtering should be skipped, |
| # point to the clean training data |
| # |
| #clean-stem = |
| |
| ### if corpus preparation should be skipped, |
| # point to the prepared training data |
| # |
| #lowercased-stem = |
| |
| [CORPUS:nc] |
| raw-stem = $wmt12-data/training/news-commentary-v7.$pair-extension |
| |
| [CORPUS:un] IGNORE |
| raw-stem = $wmt12-data/training/undoc.2000.$pair-extension |
| |
| ################################################################# |
| # LANGUAGE MODEL TRAINING |
| |
| [LM] |
| |
| ### tool to be used for language model training |
| # kenlm training |
| lm-training = "$moses-script-dir/ems/support/lmplz-wrapper.perl -bin $moses-bin-dir/lmplz" |
| settings = "--prune '0 0 1' -T $working-dir/lm -S 20%" |
| |
| # srilm |
| #lm-training = $srilm-dir/ngram-count |
| #settings = "-interpolate -kndiscount -unk" |
| |
| # irstlm training |
# msb = modified Kneser-Ney; p=0 means no singleton pruning
| #lm-training = "$moses-script-dir/generic/trainlm-irst2.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/tmp" |
| #settings = "-s msb -p 0" |
| |
| # order of the language model |
| order = 5 |
| |
### tool to be used for training a randomized language model from scratch
# (more commonly, an SRILM model is trained)
| # |
| #rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8" |
| |
| ### script to use for binary table format for irstlm or kenlm |
| # (default: no binarization) |
| |
| # irstlm |
| #lm-binarizer = $irstlm-dir/compile-lm |
| |
| # kenlm, also set type to 8 |
| lm-binarizer = $moses-bin-dir/build_binary |
| type = 8 |
| |
| ### script to create quantized language model format (irstlm) |
| # (default: no quantization) |
| # |
| #lm-quantizer = $irstlm-dir/quantize-lm |
| |
| ### script to use for converting into randomized table format |
| # (default: no randomization) |
| # |
| #lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8" |
| |
| ### each language model to be used has its own section here |
| |
| [LM:europarl] IGNORE |
| |
| ### command to run to get raw corpus files |
| # |
| #get-corpus-script = "" |
| |
| ### raw corpus (untokenized) |
| # |
| raw-corpus = $wmt12-data/training/europarl-v7.$output-extension |
| |
| ### tokenized corpus files (may contain long sentences) |
| # |
| #tokenized-corpus = |
| |
| ### if corpus preparation should be skipped, |
| # point to the prepared language model |
| # |
| #lm = |
| |
| [LM:nc] |
| raw-corpus = $wmt12-data/training/news-commentary-v7.$pair-extension.$output-extension |
| |
| [LM:un] IGNORE |
| raw-corpus = $wmt12-data/training/undoc.2000.$pair-extension.$output-extension |
| |
| [LM:news] IGNORE |
| raw-corpus = $wmt12-data/training/news.$output-extension.shuffled |
| |
| |
| ################################################################# |
| # INTERPOLATING LANGUAGE MODELS |
| |
| [INTERPOLATED-LM] IGNORE |
| |
| # if multiple language models are used, these may be combined |
| # by optimizing perplexity on a tuning set |
| # see, for instance [Koehn and Schwenk, IJCNLP 2008] |
| |
| ### script to interpolate language models |
| # if commented out, no interpolation is performed |
| # |
| script = $moses-script-dir/ems/support/interpolate-lm.perl |
| |
| ### tuning set |
| # you may use the same set that is used for mert tuning (reference set) |
| # |
| tuning-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm |
| #raw-tuning = |
| #tokenized-tuning = |
| #factored-tuning = |
| #lowercased-tuning = |
| #split-tuning = |
| |
| ### group language models for hierarchical interpolation |
| # (flat interpolation is limited to 10 language models) |
| #group = "first,second fourth,fifth" |
| |
| ### script to use for binary table format for irstlm or kenlm |
| # (default: no binarization) |
| |
| # irstlm |
| #lm-binarizer = $irstlm-dir/compile-lm |
| |
| # kenlm, also set type to 8 |
| lm-binarizer = $moses-bin-dir/build_binary |
| type = 8 |
| |
| ### script to create quantized language model format (irstlm) |
| # (default: no quantization) |
| # |
| #lm-quantizer = $irstlm-dir/quantize-lm |
| |
| ### script to use for converting into randomized table format |
| # (default: no randomization) |
| # |
| #lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8" |
| |
| ################################################################# |
| # MODIFIED MOORE LEWIS FILTERING |
| |
| [MML] IGNORE |
| |
| ### specifications for language models to be trained |
| # |
| #lm-training = $srilm-dir/ngram-count |
| #lm-settings = "-interpolate -kndiscount -unk" |
| #lm-binarizer = $moses-src-dir/bin/build_binary |
| #lm-query = $moses-src-dir/bin/query |
| #order = 5 |
| |
### in-/out-of-domain source/target corpora to train the 4 language models
#
# in-domain: point either to a parallel corpus
#indomain-stem = [CORPUS:toy:clean-split-stem]
| |
| # ... or to two separate monolingual corpora |
| #indomain-target = [LM:toy:lowercased-corpus] |
| #raw-indomain-source = $toy-data/nc-5k.$input-extension |
| |
| # point to out-of-domain parallel corpus |
| #outdomain-stem = [CORPUS:giga:clean-split-stem] |
| |
| # settings: number of lines sampled from the corpora to train each language model on |
| # (if used at all, should be small as a percentage of corpus) |
| #settings = "--line-count 100000" |
| |
| ################################################################# |
| # TRANSLATION MODEL TRAINING |
| |
| [TRAINING] |
| |
| ### training script to be used: either a legacy script or |
| # current moses training script (default) |
| # |
| script = $moses-script-dir/training/train-model.perl |
| |
| ### general options |
| # these are options that are passed on to train-model.perl, for instance |
| # * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza |
| # * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting |
| # * "-sort-parallel 8 -cores 8" to speed up phrase table building |
| # * "-parallel" for parallel execution of mkcls and giza |
| # |
| #training-options = "" |
| |
### factored training: specify here which factors are used
| # if none specified, single factor training is assumed |
| # (one translation step, surface to surface) |
| # |
| #input-factors = word lemma pos morph |
| #output-factors = word lemma pos |
| #alignment-factors = "word -> word" |
| #translation-factors = "word -> word" |
| #reordering-factors = "word -> word" |
| #generation-factors = "word -> pos" |
| #decoding-steps = "t0, g0" |
| |
| ### parallelization of data preparation step |
| # the two directions of the data preparation can be run in parallel |
| # comment out if not needed |
| # |
| parallel = yes |
| |
| ### pre-computation for giza++ |
| # giza++ has a more efficient data structure that needs to be |
# initialized with snt2cooc. if run in parts, this may reduce
| # memory requirements. set here the number of parts |
| # |
| #run-giza-in-parts = 5 |
| |
| ### symmetrization method to obtain word alignments from giza output |
| # (commonly used: grow-diag-final-and) |
| # |
| alignment-symmetrization-method = grow-diag-final-and |
| |
| ### use of Chris Dyer's fast align for word alignment |
| # |
| #fast-align-settings = "-d -o -v" |
| |
| ### use of berkeley aligner for word alignment |
| # |
| #use-berkeley = true |
| #alignment-symmetrization-method = berkeley |
| #berkeley-train = $moses-script-dir/ems/support/berkeley-train.sh |
| #berkeley-process = $moses-script-dir/ems/support/berkeley-process.sh |
| #berkeley-jar = /your/path/to/berkeleyaligner-1.1/berkeleyaligner.jar |
| #berkeley-java-options = "-server -mx30000m -ea" |
| #berkeley-training-options = "-Main.iters 5 5 -EMWordAligner.numThreads 8" |
| #berkeley-process-options = "-EMWordAligner.numThreads 8" |
| #berkeley-posterior = 0.5 |
| |
| ### use of baseline alignment model (incremental training) |
| # |
| #baseline = 68 |
| #baseline-alignment-model = "$working-dir/training/prepared.$baseline/$input-extension.vcb \ |
| # $working-dir/training/prepared.$baseline/$output-extension.vcb \ |
| # $working-dir/training/giza.$baseline/${output-extension}-$input-extension.cooc \ |
| # $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.cooc \ |
| # $working-dir/training/giza.$baseline/${output-extension}-$input-extension.thmm.5 \ |
| # $working-dir/training/giza.$baseline/${output-extension}-$input-extension.hhmm.5 \ |
| # $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.thmm.5 \ |
| # $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.hhmm.5" |
| |
| ### if word alignment should be skipped, |
| # point to word alignment files |
| # |
| #word-alignment = $working-dir/model/aligned.1 |
| |
| ### filtering some corpora with modified Moore-Lewis |
| # specify corpora to be filtered and ratio to be kept, either before or after word alignment |
| #mml-filter-corpora = toy |
| #mml-before-wa = "-proportion 0.9" |
| #mml-after-wa = "-proportion 0.9" |
| |
| ### build memory mapped suffix array phrase table |
| # (binarizing the reordering table is a good idea, since filtering makes little sense) |
| #mmsapt = "num-features=9 pfwd=g+ pbwd=g+ smooth=0 sample=1000 workers=1" |
| #binarize-all = $moses-script-dir/training/binarize-model.perl |
| |
| ### create a bilingual concordancer for the model |
| # |
| #biconcor = $moses-bin-dir/biconcor |
| |
| ## Operation Sequence Model (OSM) |
| # Durrani, Schmid and Fraser. (2011): |
| # "A Joint Sequence Translation Model with Integrated Reordering" |
| # compile Moses with --max-kenlm-order=9 if higher order is required |
| # |
| #operation-sequence-model = "yes" |
| #operation-sequence-model-order = 5 |
| #operation-sequence-model-settings = "-lmplz '$moses-src-dir/bin/lmplz -S 40% -T $working-dir/model/tmp'" |
| # |
| # if OSM training should be skipped, point to OSM Model |
| #osm-model = |
| |
| ### unsupervised transliteration module |
| # Durrani, Sajjad, Hoang and Koehn (EACL, 2014). |
| # "Integrating an Unsupervised Transliteration Model |
| # into Statistical Machine Translation." |
| # |
| #transliteration-module = "yes" |
| #post-decoding-transliteration = "yes" |
| |
| ### lexicalized reordering: specify orientation type |
| # (default: only distance-based reordering model) |
| # |
| #lexicalized-reordering = msd-bidirectional-fe |
| |
| ### hierarchical rule set |
| # |
| hierarchical-rule-set = true |
| |
| ### settings for rule extraction |
| # |
| #extract-settings = "" |
| |
| ### add extracted phrases from baseline model |
| # |
| #baseline-extract = $working-dir/model/extract.$baseline |
| # |
| # requires aligned parallel corpus for re-estimating lexical translation probabilities |
| #baseline-corpus = $working-dir/training/corpus.$baseline |
| #baseline-alignment = $working-dir/model/aligned.$baseline.$alignment-symmetrization-method |
| |
| ### unknown word labels (target syntax only) |
| # enables use of unknown word labels during decoding |
| # label file is generated during rule extraction |
| # |
| #use-unknown-word-labels = true |
| |
| ### if phrase extraction should be skipped, |
| # point to stem for extract files |
| # |
| # extracted-phrases = |
| |
| ### settings for rule scoring |
| # |
| score-settings = "--GoodTuring" |
| |
| ### include word alignment in phrase table |
| # |
| #include-word-alignment-in-rules = yes |
| |
| ### sparse lexical features |
| # |
| #sparse-features = "target-word-insertion top 50, source-word-deletion top 50, word-translation top 50 50, phrase-length" |
| |
| ### domain adaptation settings |
| # options: sparse, any of: indicator, subset, ratio |
| #domain-features = "subset" |
| |
| ### if phrase table training should be skipped, |
| # point to phrase translation table |
| # |
| # phrase-translation-table = |
| |
| ### if reordering table training should be skipped, |
| # point to reordering table |
| # |
| # reordering-table = |
| |
| ### filtering the phrase table based on significance tests |
| # Johnson, Martin, Foster and Kuhn. (2007): "Improving Translation Quality by Discarding Most of the Phrasetable" |
# options: -n number of translations; -l threshold: 'a+e', 'a-e', or a positive real value (-log prob threshold)
| #salm-index = /path/to/project/salm/Bin/Linux/Index/IndexSA.O64 |
| #sigtest-filter = "-l a+e -n 50" |
| |
| ### if training should be skipped, |
| # point to a configuration file that contains |
| # pointers to all relevant model files |
| # |
| #config-with-reused-weights = |
| |
| ##################################################### |
| ### TUNING: finding good weights for model components |
| |
| [TUNING] |
| |
| ### instead of tuning with this setting, old weights may be recycled |
| # specify here an old configuration file with matching weights |
| # |
| #weight-config = $working-dir/tuning/moses.weight-reused.ini.1 |
| |
| ### tuning script to be used |
| # |
| tuning-script = $moses-script-dir/training/mert-moses.pl |
| tuning-settings = "-mertdir $moses-bin-dir" |
| |
| ### specify the corpus used for tuning |
| # it should contain 1000s of sentences |
| # |
| input-sgm = $wmt12-data/dev/newstest2010-src.$input-extension.sgm |
| #raw-input = |
| #tokenized-input = |
| #factorized-input = |
| #input = |
| # |
| reference-sgm = $wmt12-data/dev/newstest2010-ref.$output-extension.sgm |
| #raw-reference = |
| #tokenized-reference = |
| #factorized-reference = |
| #reference = |
| |
| ### size of n-best list used (typically 100) |
| # |
| nbest = 100 |
| |
| ### ranges for weights for random initialization |
| # if not specified, the tuning script will use generic ranges |
# it is not clear if this matters
| # |
| # lambda = |
| |
| ### additional flags for the filter script |
| # |
| filter-settings = "" |
| |
| ### additional flags for the decoder |
| # |
| decoder-settings = "" |
| |
| ### if tuning should be skipped, specify this here |
| # and also point to a configuration file that contains |
| # pointers to all relevant model files |
| # |
| #config = |
| |
| ######################################################### |
| ## RECASER: restore case, this part only trains the model |
| |
| [RECASING] |
| |
| #decoder = $moses-bin-dir/moses |
| |
| ### training data |
# raw input still needs to be tokenized;
# alternatively, already tokenized input may be specified
| # |
| #tokenized = [LM:europarl:tokenized-corpus] |
| |
| # recase-config = |
| |
| #lm-training = $srilm-dir/ngram-count |
| |
| ####################################################### |
| ## TRUECASER: train model to truecase corpora and input |
| |
| [TRUECASER] |
| |
| ### script to train truecaser models |
| # |
| trainer = $moses-script-dir/recaser/train-truecaser.perl |
| |
| ### training data |
| # data on which truecaser is trained |
# if no training data is specified, the parallel corpus is used
| # |
| # raw-stem = |
| # tokenized-stem = |
| |
| ### trained model |
| # |
| # truecase-model = |
| |
| ###################################################################### |
## EVALUATION: translate a test set using the tuned system and score it
| |
| [EVALUATION] |
| |
| ### number of jobs (if parallel execution on cluster) |
| # |
| #jobs = 10 |
| |
| ### additional flags for the filter script |
| # |
| #filter-settings = "" |
| |
| ### additional decoder settings |
| # switches for the Moses decoder |
| # common choices: |
| # "-threads N" for multi-threading |
| # "-mbr" for MBR decoding |
| # "-drop-unknown" for dropping unknown source words |
| # "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning |
| # |
| #decoder-settings = "" |
| |
| ### specify size of n-best list, if produced |
| # |
| #nbest = 100 |
| |
| ### multiple reference translations |
| # |
| #multiref = yes |
| |
| ### prepare system output for scoring |
| # this may include detokenization and wrapping output in sgm |
| # (needed for nist-bleu, ter, meteor) |
| # |
| detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension" |
| #recaser = $moses-script-dir/recaser/recase.perl |
| wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension" |
| #output-sgm = |
| |
| ### BLEU |
| # |
| nist-bleu = $moses-script-dir/generic/mteval-v13a.pl |
| nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c" |
| #multi-bleu = "$moses-script-dir/generic/multi-bleu.perl -lc" |
| #multi-bleu-c = $moses-script-dir/generic/multi-bleu.perl |
| #ibm-bleu = |
| |
| ### TER: translation error rate (BBN metric) based on edit distance |
| # not yet integrated |
| # |
| # ter = |
| |
### METEOR: gives credit to stem / WordNet synonym matches
| # not yet integrated |
| # |
| # meteor = |
| |
| ### Analysis: carry out various forms of analysis on the output |
| # |
| analysis = $moses-script-dir/ems/support/analysis.perl |
| # |
| # also report on input coverage |
| analyze-coverage = yes |
| # |
| # also report on phrase mappings used |
| report-segmentation = yes |
| # |
| # report precision of translations for each input word, broken down by |
| # count of input word in corpus and model |
| #report-precision-by-coverage = yes |
| # |
| # further precision breakdown by factor |
| #precision-by-coverage-factor = pos |
| # |
| # visualization of the search graph in tree-based models |
| #analyze-search-graph = yes |
| |
| [EVALUATION:newstest2011] |
| |
| ### input data |
| # |
| input-sgm = $wmt12-data/dev/newstest2011-src.$input-extension.sgm |
| # raw-input = |
| # tokenized-input = |
| # factorized-input = |
| # input = |
| |
| ### reference data |
| # |
| reference-sgm = $wmt12-data/dev/newstest2011-ref.$output-extension.sgm |
| # raw-reference = |
| # tokenized-reference = |
| # reference = |
| |
| ### analysis settings |
| # may contain any of the general evaluation analysis settings |
| # specific setting: base coverage statistics on earlier run |
| # |
| #precision-by-coverage-base = $working-dir/evaluation/test.analysis.5 |
| |
| ### wrapping frame |
| # for nist-bleu and other scoring scripts, the output needs to be wrapped |
| # in sgm markup (typically like the input sgm) |
| # |
| wrapping-frame = $input-sgm |
| |
| ########################################## |
| ### REPORTING: summarize evaluation scores |
| |
| [REPORTING] |
| |
| ### currently no parameters for reporting section |
| |