#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Runs the Z-MERT and PRO tuners.
"""
from __future__ import print_function
import argparse
from collections import namedtuple
import logging
import os
import shutil
import signal
import stat
from subprocess import CalledProcessError, Popen, PIPE, check_output, call
import sys
import re
# Root of the Joshua installation; None when the environment variable is unset.
JOSHUA = os.environ.get('JOSHUA')

# Usage example text. Raw string so the trailing backslashes (shell line
# continuations) are kept literally.
# NOTE(review): the script name after "$JOSHUA/scripts/support/" is not visible
# in SOURCE and is left as found -- restore it from the repository.
EXAMPLE = r"""
Example invocation:
$JOSHUA/scripts/support/ \
/path/to/source.txt \
/path/to/reference.en \
--tuner zmert \
--tunedir working-dir \
--decoder /path/to/decoder/command \
--decoder-output /path/to/decoder/nbest/output \
--decoder-config /path/to/joshua.config
--tuner can be one of zmert or pro. If the path to the reference is a prefix
with ".0", ".1", etc extensions, they are treated as multiple references
(extensions "0", "1", etc also works --- i.e., the path to the reference can
have a trailing period). The decoder command should decode your source file and
produce output at the --decoder-output location in the Joshua n-best format, e.g.,
0 ||| example candidate translation ||| tm_pt_0=1 lm_0=17 ||| -34.2
"""
# Z-MERT configuration template. Each <NAME> placeholder is replaced by
# write_template() with the matching entry of its lookup dictionary.
ZMERT_CONFIG_TEMPLATE = """### MERT parameters
# target sentences file name (in this case, file name prefix)
-r <REF>
-rps <NUMREFS> # references per sentence
-p <TUNEDIR>/params.txt # parameter file
-m <METRIC> # evaluation metric and its options
-maxIt <ITERATIONS> # maximum MERT iterations
-ipi 20 # number of intermediate initial points per iteration
-cmd <DECODER_COMMAND> # file containing commands to run decoder
-decOut <DECODER_OUTPUT> # file produced by decoder
-dcfg <DECODER_CONFIG> # decoder config file
-N 300 # size of N-best list
-v 1 # verbosity level (0-2; higher value => more verbose)
"""
# PRO configuration template. Each <NAME> placeholder is replaced by
# write_template() with the matching entry of its lookup dictionary.
# NOTE(review): in SOURCE the comments for metric, iterations, decoder command,
# decoder output and decoder config had no option line beneath them; the lines
# below were restored using the flag names of the Z-MERT template. Any
# classifier-selection option that may have accompanied -classifierParams is
# not visible in SOURCE -- confirm against the repository.
PRO_CONFIG_TEMPLATE = """### Part 1: parameters similar to Z-MERT
# target sentences file name (in this case, file name prefix)
-r <REF>
# references per sentence
-rps <NUMREFS>
# parameter file
-p <TUNEDIR>/params.txt
#metric setting:
-m <METRIC>
#-m TER nocase punc 5 5 joshua/zmert/tercom-0.7.25/tercom.7.25.jar 1
#-m TER-BLEU nocase punc 20 50 joshua/zmert/tercom-0.7.25/tercom.7.25.jar 1 4 closest
#-m METEOR en norm_yes keepPunc 2 #old meteor interface #Z-MERT Meteor interface(not working)
#-m Meteor en lowercase '0.5 1.0 0.5 0.5' 'exact stem synonym paraphrase' '1.0 0.5 0.5 0.5' #CMU meteor interface
# maximum PRO iterations
-maxIt <ITERATIONS>
# file containing commands to run decoder
-cmd <DECODER_COMMAND>
# file prodcued by decoder
-decOut <DECODER_OUTPUT>
# decoder config file
-dcfg <DECODER_CONFIG>
# size of N-best list
-N 300
# verbosity level (0-2; higher value => more verbose)
-v 1
#use one of the classifiers(and the corresponding parameter setting) below:
#1.perceptron paramters
-classifierParams '30 0.5 0.0'
#2.MegaM parameters
#-classifierParams './megam_command ./ ./megam_weights'
#3.Stanford Max-Ent parameters
#-classifierParams './maxent_prop_file'
#4.LibSVM parameters
#-classifierParams './libsvm_command ./ ./'
# num of candidate samples
-Tau 8000
# num of top candidates
-Xi 50
# linear interpolation coef. range:[0,1]. 1=using new weights only; 0=using previous weights only
-interCoef 0.5
# threshold for sample selection
-metricDiff 0.05
"""
# MIRA configuration template. Each <NAME> placeholder is replaced by
# write_template() with the matching entry of its lookup dictionary.
# NOTE(review): in SOURCE the comments for metric, iterations, decoder command,
# decoder output and decoder config had no option line beneath them; the lines
# below were restored using the flag names of the Z-MERT template -- confirm
# against the repository.
MIRA_CONFIG_TEMPLATE = """### Part 1: parameters similar to Z-MERT
# target sentences file name (in this case, file name prefix)
-r <REF>
# references per sentence
-rps <NUMREFS>
# parameter file
-p <TUNEDIR>/params.txt
#metric setting:
-m <METRIC>
#-m TER nocase punc 5 5 joshua/zmert/tercom-0.7.25/tercom.7.25.jar 1
#-m TER-BLEU nocase punc 20 50 joshua/zmert/tercom-0.7.25/tercom.7.25.jar 1 4 closest
#-m METEOR en norm_yes keepPunc 2 #old meteor interface #Z-MERT Meteor interface(not working)
#-m Meteor en lowercase '0.5 1.0 0.5 0.5' 'exact stem synonym paraphrase' '1.0 0.5 0.5 0.5' #CMU meteor interface
# maximum MIRA iterations
-maxIt <ITERATIONS>
# file containing commands to run decoder
-cmd <DECODER_COMMAND>
# file prodcued by decoder
-decOut <DECODER_OUTPUT>
# decoder config file
-dcfg <DECODER_CONFIG>
# size of N-best list
-N 300
# verbosity level (0-2; higher value => more verbose)
-v 1
### PART 2: MIRA parameters
#oracle selection method:
#1: "hope"(default)
#2: best metric score(ex: max BLEU)
-oracleSelection 1
#prediction selection method:
#1: "fear"(default)
#2: max model score
#3: worst metric score(ex: min BLEU)
-predictionSelection 1
#shuffle the training samples? (default:1)
-needShuffle 1
#average the weights after each epoch? (default:1)
-needAvg 1
#when use BLEU/TER-BLEU as metric, use the pseudo corpus to compute BLEU? (default:1)
-usePseudoCorpus 1
#corpus decay coefficient (only valid when pseudo corpus is used for BLEU, default:0.99)
-corpusDecay 0.99
#scale the model score(in order to make it comparable to the metric score)?(default:1)
-needScaling 1
#options for scaling (only valid when -needScaling=1)
-scoreRatio 5 #scale the model score so that abs(model_score/metric_score) \\approx scoreRatio (default:5)
#MIRA internal iterations (default:1)
#-miraIter 1
#regularization parameter (default:0.01)
-C 0.01
#run perceptron mode? (default:0)
-runPercep 0
"""
# AdaGrad configuration template. Each <NAME> placeholder is replaced by
# write_template() with the matching entry of its lookup dictionary.
# NOTE(review): in SOURCE the comments for metric, iterations, decoder command,
# decoder output and decoder config had no option line beneath them; the lines
# below were restored using the flag names of the Z-MERT template -- confirm
# against the repository.
ADAGRAD_CONFIG_TEMPLATE = """### Part 1: parameters similar to Z-MERT
# target sentences file name (in this case, file name prefix)
-r <REF>
# references per sentence
-rps <NUMREFS>
# parameter file
-p <TUNEDIR>/params.txt
#metric setting:
-m <METRIC>
#-m TER nocase punc 5 5 joshua/zmert/tercom-0.7.25/tercom.7.25.jar 1
#-m TER-BLEU nocase punc 20 50 joshua/zmert/tercom-0.7.25/tercom.7.25.jar 1 4 closest
#-m METEOR en norm_yes keepPunc 2 #old meteor interface #Z-MERT Meteor interface(not working)
#-m Meteor en lowercase '0.5 1.0 0.5 0.5' 'exact stem synonym paraphrase' '1.0 0.5 0.5 0.5' #CMU meteor interface
# maximum iterations
-maxIt <ITERATIONS>
# file containing commands to run decoder
-cmd <DECODER_COMMAND>
# file prodcued by decoder
-decOut <DECODER_OUTPUT>
# decoder config file
-dcfg <DECODER_CONFIG>
# size of N-best list
-N 300
# verbosity level (0-2; higher value => more verbose)
-v 1
### PART 2: AdaGrad parameters
#oracle selection method:
#1: "hope"(default)
#2: best metric score(ex: max BLEU)
-oracleSelection 1
#prediction selection method:
#1: "fear"(default)
#2: max model score
#3: worst metric score(ex: min BLEU)
-predictionSelection 1
#shuffle the training samples? (default:1)
-needShuffle 1
#average the weights after each epoch? (default:1)
-needAvg 1
#return the best weights during tuning? (default:1)
-returnBest 1
#when use BLEU/TER-BLEU as metric, use the pseudo corpus to compute BLEU? (default:1)
-usePseudoCorpus 1
#corpus decay coefficient (only valid when pseudo corpus is used for BLEU, default:0.99)
-corpusDecay 0.99
#scale the model score(in order to make it comparable to the metric score)?(default:1)
-needScaling 1
#options for scaling (only valid when -needScaling=1)
-scoreRatio 5 #scale the model score so that abs(model_score/metric_score) \\approx scoreRatio (default:5)
#regularzation (0: no reg 1: l1-reg; 2: l2-reg. Default: 2)
-regularization 2
#regularization coefficient
-lambda 0.1
#step size coefficient
-eta 0.1
#mini-batch size (default: 10)
-batchSize 10
"""
# Template for the tuner's params.txt file.
# NOTE(review): the assignment header and the <PARAMS> placeholder are not
# visible in SOURCE. They are reconstructed from setup_configs(), which renders
# PARAMS_TEMPLATE with a 'PARAMS' entry holding the per-feature lines; placing
# the placeholder before the fixed entries is an assumption -- confirm.
PARAMS_TEMPLATE = """<PARAMS>
WordPenalty ||| -2.844814 Opt -Inf +Inf -5 0
OOVPenalty ||| 1 Fix 0 0 0 0
normalization = absval 1 lm_0
"""
def write_template(template, path, lookup):
    """Write a template to a file, filling in ``<NAME>`` placeholders.

    Every ``<NAME>`` occurrence in ``template`` is replaced with the string
    form of ``lookup['NAME']``. The template is processed line by line and
    each line is written followed by a newline.

    Args:
        template: template text containing ``<NAME>`` placeholders.
        path: destination file; it is created or overwritten.
        lookup: mapping from placeholder name to replacement value.

    Raises:
        KeyError: if the template uses a name that is not in ``lookup``.
    """
    def substitute(match):
        # group(1) is the placeholder name between the angle brackets.
        return '{0}'.format(lookup[match.group(1)])

    # The context manager closes the file even if a lookup fails midway.
    with open(path, 'w') as out:
        for line in template.split('\n'):
            out.write(re.sub(r'<(.*?)>', substitute, line) + '\n')
def parse_tm_line(line):
    """Parse a TM line and return its owner, span, and path.

    Works on both the old TM format:

        tm = moses pt 0 /path/to/grammar

    and the new one:

        tm = moses -owner pt -path /path/to/grammar -maxspan 0

    Args:
        line: a ``tm = ...`` line.

    Returns:
        A tuple ``(owner, maxspan, path)`` of strings. In the new format, any
        field whose option is absent is returned as ``''``.

    Raises:
        ValueError: if an old-format line does not have exactly four fields.
    """
    line = re.sub(r'tm\s*=\s*', '', line).strip()
    owner = ''
    maxspan = ''
    path = ''
    if '-path' in line:
        # New format: grammar type followed by "-key value" pairs. Pairing the
        # tokens with zip() ignores a trailing key that has no value instead
        # of indexing past the end of the list.
        tokens = line.split()[1:]
        for key, value in zip(tokens[::2], tokens[1::2]):
            if key == '-path':
                path = value
            elif key == '-owner':
                owner = value
            elif key == '-maxspan':
                maxspan = value
    else:
        # Old format: four positional fields; the grammar type is not returned.
        _grammartype, owner, maxspan, path = line.split()
    return (owner, maxspan, path)
def get_features(config_file):
    """Query the decoder for the dense features enabled by a config file.

    Runs ``$JOSHUA/bin/joshua-decoder -c <config_file> -show-weights -v 0``
    and collects its non-empty output lines.

    Args:
        config_file: path to the decoder configuration file.

    Returns:
        A list with one tuple of whitespace-separated fields per non-empty
        output line.

    Raises:
        subprocess.CalledProcessError: if the decoder exits with a non-zero
            status.
    """
    output = check_output("%s/bin/joshua-decoder -c %s -show-weights -v 0" % (JOSHUA, config_file), shell=True)
    features = []
    # check_output returns bytes, so split on a bytes newline and decode per line.
    for item in output.split(b'\n'):
        item = item.decode()
        if item != "":
            # NOTE(review): the statement that fills `features` is not visible
            # in SOURCE. It is reconstructed from setup_configs(), which
            # unpacks each entry as a (feature, weight) pair -- confirm.
            features.append(tuple(item.split()))
    return features
def get_num_refs(prefix):
    """Determine how many reference files exist for ``prefix``.

    Numbered references are looked for first, as ``<prefix>.0``,
    ``<prefix>.1``, ... and then as ``<prefix>0``, ``<prefix>1``, ...; the
    count stops at the first missing number. If no numbered file exists,
    ``prefix`` itself is treated as a single reference.

    Args:
        prefix: path to the reference file, or the common prefix of several.

    Returns:
        The number of consecutive numbered references found, 1 if only
        ``prefix`` exists, or 0 if nothing exists.
    """
    for ext in ['.', '']:
        if os.path.exists('%s%s0' % (prefix, ext)):
            suffix = 0
            while os.path.exists('%s%s%d' % (prefix, ext, suffix)):
                suffix += 1
            return suffix
    if os.path.exists(prefix):
        return 1
    return 0
def safe_symlink(to_path, from_path):
    """Create a symlink at ``from_path`` pointing to ``to_path``.

    An existing regular file or symlink at ``from_path`` is removed first so
    that the link can be re-created on repeated runs.

    Args:
        to_path: the link target.
        from_path: where the symlink is created.

    Raises:
        OSError: if ``from_path`` exists but is neither a file nor a symlink
            (for example a directory), or the link cannot be created.
    """
    # islink() also covers dangling links, for which isfile() is False.
    if os.path.isfile(from_path) or os.path.islink(from_path):
        os.remove(from_path)
    os.symlink(to_path, from_path)
def setup_configs(template, template_dest, target, num_refs, tunedir, command, config, output, metric, iterations):
    """Write the tuner config file and ``params.txt`` for a tuning run.

    The tuners all write the file "params.txt", but they use different names
    for the config file, so the template and its destination are parameters.

    Args:
        template: tuner config template text with ``<NAME>`` placeholders.
        template_dest: path the rendered tuner config is written to.
        target: reference file path (or prefix).
        num_refs: number of references per sentence.
        tunedir: tuning directory; ``params.txt`` is written inside it.
        command: decoder command, substituted for ``<DECODER_COMMAND>``.
        config: decoder config file, substituted for ``<DECODER_CONFIG>`` and
            passed to get_features().
        output: decoder n-best output path, substituted for ``<DECODER_OUTPUT>``.
        metric: evaluation metric specification.
        iterations: maximum number of tuner iterations.
    """
    # DECODER_COMMAND and DECODER_CONFIG must be present: the Z-MERT template
    # references both placeholders and write_template() raises KeyError for a
    # placeholder that has no lookup entry.
    write_template(template, template_dest,
                   {'REF': target,
                    'NUMREFS': num_refs,
                    'TUNEDIR': tunedir,
                    'METRIC': metric,
                    'ITERATIONS': iterations,
                    'DECODER_COMMAND': command,
                    'DECODER_CONFIG': config,
                    'DECODER_OUTPUT': output})

    # Query the decoder for the list of dense parameters. These need to be listed in the
    # config file or MERT will not know about them, despite them being listed in params.txt.
    params = []
    for feature, weight in get_features(config):
        if feature.startswith('lm_'):
            # Language-model features get their own bounds.
            params.append('%s ||| %s Opt 0.1 +Inf +0.5 +1.5' % (feature, weight))
        else:
            params.append('%s ||| %s Opt -Inf +Inf -1 +1' % (feature, weight))
    paramstr = '\n'.join(params)

    write_template(PARAMS_TEMPLATE, '%s/params.txt' % (tunedir),
                   {'REF': target,
                    'NUMREFS': num_refs,
                    'TUNEDIR': tunedir,
                    'PARAMS': paramstr})
def run_zmert(tunedir, source, target, command, config, output, opts):
    """Run Z-MERT after setting up all its file requirements.

    Args:
        tunedir: tuning directory; receives mert.config, params.txt, mert.log.
        source: path to the source file (not used by this function).
        target: reference file path (or prefix).
        command: decoder command handed to the tuner.
        config: decoder configuration file.
        output: path of the n-best file the decoder produces.
        opts: parsed options; ``metric`` and ``iterations`` are read
            (iterations defaults to 10 when unset).
    """
    setup_configs(ZMERT_CONFIG_TEMPLATE, '%s/mert.config' % (tunedir),
                  target, get_num_refs(target), tunedir, command, config, output,
                  opts.metric, opts.iterations or 10)

    tuner_mem = '10g'
    call("java -d64 -Xmx%s -cp %s/target/joshua-*-jar-with-dependencies.jar org.apache.joshua.zmert.ZMERT -maxMem 4000 %s/mert.config > %s/mert.log 2>&1" % (tuner_mem, JOSHUA, tunedir, tunedir), shell=True)

    # NOTE(review): only the fragment "os.path.join(tunedir, ''))" of this
    # statement survives in SOURCE. The call to safe_symlink() and both file
    # names are reconstructed assumptions -- confirm against the repository.
    safe_symlink(os.path.basename(config) + '.ZMERT.final',
                 os.path.join(tunedir, 'joshua.config.final'))
def run_pro(tunedir, source, target, command, config, output, opts):
    """Run PRO after setting up all its file requirements.

    Args:
        tunedir: tuning directory; receives pro.config, params.txt, pro.log.
        source: path to the source file (not used by this function).
        target: reference file path (or prefix).
        command: decoder command handed to the tuner.
        config: decoder configuration file.
        output: path of the n-best file the decoder produces.
        opts: parsed options; ``metric`` and ``iterations`` are read
            (iterations defaults to 30 when unset).
    """
    setup_configs(PRO_CONFIG_TEMPLATE, '%s/pro.config' % (tunedir),
                  target, get_num_refs(target), tunedir, command, config, output,
                  opts.metric, opts.iterations or 30)

    tuner_mem = '10g'
    # NOTE(review): the Java main class is missing from the command in SOURCE.
    # "org.apache.joshua.pro.PRO" is inferred from the naming of the sibling
    # tuners (zmert.ZMERT, mira.MIRA, adagrad.AdaGrad) -- confirm.
    call("java -d64 -Xmx%s -cp %s/target/joshua-*-jar-with-dependencies.jar org.apache.joshua.pro.PRO %s/pro.config > %s/pro.log 2>&1" % (tuner_mem, JOSHUA, tunedir, tunedir), shell=True)

    # NOTE(review): only the fragment "os.path.join(tunedir, ''))" of this
    # statement survives in SOURCE. The call to safe_symlink() and both file
    # names are reconstructed assumptions -- confirm against the repository.
    safe_symlink(os.path.basename(config) + '.PRO.final',
                 os.path.join(tunedir, 'joshua.config.final'))
def run_mira(tunedir, source, target, command, config, output, opts):
    """Run MIRA after setting up all its file requirements.

    Args:
        tunedir: tuning directory; receives mira.config, params.txt, mira.log.
        source: path to the source file (not used by this function).
        target: reference file path (or prefix).
        command: decoder command handed to the tuner.
        config: decoder configuration file.
        output: path of the n-best file the decoder produces.
        opts: parsed options; ``metric`` and ``iterations`` are read
            (iterations defaults to 5 when unset).
    """
    setup_configs(MIRA_CONFIG_TEMPLATE, '%s/mira.config' % (tunedir),
                  target, get_num_refs(target), tunedir, command, config, output,
                  opts.metric, opts.iterations or 5)

    tuner_mem = '10g'
    call("java -d64 -Xmx%s -cp %s/target/joshua-*-jar-with-dependencies.jar org.apache.joshua.mira.MIRA %s/mira.config > %s/mira.log 2>&1" % (tuner_mem, JOSHUA, tunedir, tunedir), shell=True)

    # NOTE(review): only the fragment "os.path.join(tunedir, ''))" of this
    # statement survives in SOURCE. The call to safe_symlink() and both file
    # names are reconstructed assumptions -- confirm against the repository.
    safe_symlink(os.path.basename(config) + '.MIRA.final',
                 os.path.join(tunedir, 'joshua.config.final'))
def run_adagrad(tunedir, source, target, command, config, output, opts):
    """Run AdaGrad after setting up all its file requirements.

    Args:
        tunedir: tuning directory; receives adagrad.config, params.txt and
            adagrad.log.
        source: path to the source file (not used by this function).
        target: reference file path (or prefix).
        command: decoder command handed to the tuner.
        config: decoder configuration file.
        output: path of the n-best file the decoder produces.
        opts: parsed options; ``metric`` and ``iterations`` are read
            (iterations defaults to 10 when unset).
    """
    setup_configs(ADAGRAD_CONFIG_TEMPLATE, '%s/adagrad.config' % (tunedir),
                  target, get_num_refs(target), tunedir, command, config, output,
                  opts.metric, opts.iterations or 10)

    tuner_mem = '10g'
    call("java -d64 -Xmx%s -cp %s/target/joshua-*-jar-with-dependencies.jar org.apache.joshua.adagrad.AdaGrad %s/adagrad.config > %s/adagrad.log 2>&1" % (tuner_mem, JOSHUA, tunedir, tunedir), shell=True)

    # NOTE(review): only the fragment "os.path.join(tunedir, ''))" of this
    # statement survives in SOURCE. The call to safe_symlink() and both file
    # names are reconstructed assumptions -- confirm against the repository.
    safe_symlink(os.path.basename(config) + '.ADAGRAD.final',
                 os.path.join(tunedir, 'joshua.config.final'))
def error_quit(message):
    """Log ``message`` as an error and terminate the program.

    Args:
        message: text to report before exiting.

    Raises:
        SystemExit: always, with a non-zero status.
    """
    # NOTE(review): the body is not visible in SOURCE; logging the message and
    # exiting with status 2 is reconstructed from the function's name and its
    # use for fatal start-up errors -- confirm the exit status.
    logging.error(message)
    sys.exit(2)
def handle_args(clargs):
    """Process the command-line options.

    Args:
        clargs: list of argument strings, without the program name.

    Returns:
        The ``argparse.Namespace`` with attributes ``source``, ``target``,
        ``tunedir``, ``tuner``, ``decoder``, ``decoder_config``,
        ``decoder_output_file``, ``decoder_log_file``, ``iterations``,
        ``metric`` and ``verbose``.

    Raises:
        SystemExit: with status 2 if the arguments cannot be parsed.
    """
    class MyParser(argparse.ArgumentParser):
        """Parser that reports errors through ``logging`` before exiting."""

        def error(self, message):
            logging.error('ERROR: %s\n' % message)
            # argparse expects error() not to return: it documents printing a
            # usage message and exiting with status 2. Returning here would
            # let parsing continue with invalid arguments.
            self.print_usage(sys.stderr)
            sys.exit(2)

    # Parse the command-line arguments.
    parser = MyParser(description='run the Z-MERT or PRO tuners ')
    parser.add_argument('source', help='path to source file')
    parser.add_argument('target', help='path to reference file (optionally a prefix)')
    parser.add_argument(
        '-d', '--tunedir', default='SDFW',
        help='path to tuning directory')
    parser.add_argument(
        '--tuner', default='zmert',
        help='which tuner to use: zmert, pro, mira, or adagrad')
    parser.add_argument(
        '--decoder', default='tune/decoder_command',
        help='The path to the decoder or wrapper script. This script is responsible for '
             'producing the output file in the location specified by the path passed to '
             '--decoder-output-file. It is not passed the source file, so it needs to arrange '
             'for that on its own.'
    )
    parser.add_argument(
        '--decoder-config', default='tune/model/joshua.config',
        help='location of decoder configuration file. This file is used to read the set of '
             'feature functions so that tuning parameters can be setup for each weight'
    )
    parser.add_argument(
        '--decoder-output-file', default='tune/output.nbest',
        help='location of n-best output file produced by --decoder')
    parser.add_argument(
        '--decoder-log-file', default='tune/joshua.log',
        help='location of decoder n-best log file')
    parser.add_argument(
        '-i', '--iterations', type=int,
        help='the maximum number of iterations to run the tuner for')
    parser.add_argument(
        '-m', '--metric', default='BLEU 4 closest',
        help='the metric to optimize')
    parser.add_argument(
        '-v', '--verbose', action='store_true',
        help='print informational messages'
    )

    return parser.parse_args(clargs)
def main(argv):
    """Parse the command line and run the selected tuner.

    Args:
        argv: full argument vector, including the program name at index 0.
    """
    opts = handle_args(argv[1:])

    logging.basicConfig(
        level=logging.DEBUG if opts.verbose else logging.WARNING,
        format='* %(message)s'
    )

    if not os.path.exists(opts.tunedir):
        # NOTE(review): the body of this check is not visible in SOURCE.
        # Creating the directory is assumed, since the tuners write their
        # config files into it -- confirm.
        os.makedirs(opts.tunedir)

    # Every runner takes (tunedir, source, target, command, config, output, opts).
    if opts.tuner in ['mert', 'zmert']:
        run_zmert(opts.tunedir, opts.source, opts.target, opts.decoder, opts.decoder_config, opts.decoder_output_file, opts)
    elif opts.tuner == 'pro':
        run_pro(opts.tunedir, opts.source, opts.target, opts.decoder, opts.decoder_config, opts.decoder_output_file, opts)
    elif 'mira' in opts.tuner:
        run_mira(opts.tunedir, opts.source, opts.target, opts.decoder, opts.decoder_config, opts.decoder_output_file, opts)
    elif 'adagrad' in opts.tuner:
        run_adagrad(opts.tunedir, opts.source, opts.target, opts.decoder, opts.decoder_config, opts.decoder_output_file, opts)
    else:
        # Report an unrecognised tuner instead of silently doing nothing.
        error_quit('ERROR: unknown tuner "%s"' % opts.tuner)
if __name__ == "__main__":
    # Explicit check rather than `assert`: assertions are stripped when Python
    # runs with -O, which would let an unset JOSHUA through.
    if not JOSHUA:
        error_quit('ERROR: The JOSHUA environment variable must be defined.')

    main(sys.argv)