| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| package org.apache.joshua.zmert; |
| |
| import java.io.BufferedReader; |
| import java.io.BufferedWriter; |
| import java.io.File; |
| import java.io.FileInputStream; |
| import java.io.FileNotFoundException; |
| import java.io.FileOutputStream; |
| import java.io.FileReader; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.InputStreamReader; |
| import java.io.ObjectInputStream; |
| import java.io.ObjectOutputStream; |
| import java.io.OutputStream; |
| import java.io.OutputStreamWriter; |
| import java.io.PrintWriter; |
| import java.text.DecimalFormat; |
| import java.util.ArrayList; |
| import java.util.Collections; |
| import java.util.Date; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.Random; |
| import java.util.Scanner; |
| import java.util.TreeSet; |
| import java.util.Vector; |
| import java.util.concurrent.ConcurrentHashMap; |
| import java.util.concurrent.ExecutorService; |
| import java.util.concurrent.Executors; |
| import java.util.concurrent.Semaphore; |
| import java.util.zip.GZIPInputStream; |
| import java.util.zip.GZIPOutputStream; |
| |
| import org.apache.joshua.decoder.Decoder; |
| import org.apache.joshua.decoder.JoshuaConfiguration; |
| import org.apache.joshua.metrics.EvaluationMetric; |
| import org.apache.joshua.util.StreamGobbler; |
| import org.apache.joshua.util.io.ExistingUTF8EncodedTextFile; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| /** |
| * This code was originally written by Omar Zaidan. In September of 2012, it was augmented to support |
| * a sparse feature implementation. |
| * |
| * @author Omar Zaidan |
| */ |
| |
| public class MertCore { |
| |
| private static final Logger LOG = LoggerFactory.getLogger(MertCore.class); |
| |
| private final JoshuaConfiguration joshuaConfiguration; |
| private TreeSet<Integer>[] indicesOfInterest_all; |
| |
| private final static DecimalFormat f4 = new DecimalFormat("###0.0000"); |
| |
| private final static double NegInf = Double.NEGATIVE_INFINITY; |
| private final static double PosInf = Double.POSITIVE_INFINITY; |
| private final static double epsilon = 1.0 / 1000000; |
| |
| private int verbosity; // anything of priority <= verbosity will be printed |
| // (lower value for priority means more important) |
| |
| private Random randGen; |
| private int generatedRands; |
| |
| private int numSentences; |
| // number of sentences in the dev set |
| // (aka the "MERT training" set) |
| |
| private int numDocuments; |
| // number of documents in the dev set |
| // this should be 1, unless doing doc-level optimization |
| |
| private int[] docOfSentence; |
| // docOfSentence[i] stores which document contains the i'th sentence. |
| // docOfSentence is 0-indexed, as are the documents (i.e. first doc is indexed 0) |
| |
| private int[] docSubsetInfo; |
| // stores information regarding which subset of the documents are evaluated |
| // [0]: method (0-6) |
| // [1]: first (1-indexed) |
| // [2]: last (1-indexed) |
| // [3]: size |
| // [4]: center |
| // [5]: arg1 |
| // [6]: arg2 |
| // [1-6] are 0 for method 0, [6] is 0 for methods 1-4 as well |
| // only [1] and [2] are needed for optimization. The rest are only needed for an output message. |
| |
| private int refsPerSen; |
| // number of reference translations per sentence |
| |
| private int textNormMethod; |
| // 0: no normalization, 1: "NIST-style" tokenization, and also rejoin 'm, 're, *'s, 've, 'll, 'd, |
| // and n't, |
| // 2: apply 1 and also rejoin dashes between letters, 3: apply 1 and also drop non-ASCII |
| // characters |
| // 4: apply 1+2+3 |
| |
| private int numParams; |
| // number of features for the log-linear model |
| |
| private double[] normalizationOptions; |
| // How should a lambda[] vector be normalized (before decoding)? |
| // nO[0] = 0: no normalization |
| // nO[0] = 1: scale so that parameter nO[2] has absolute value nO[1] |
| // nO[0] = 2: scale so that the maximum absolute value is nO[1] |
| // nO[0] = 3: scale so that the minimum absolute value is nO[1] |
| // nO[0] = 4: scale so that the L-nO[1] norm equals nO[2] |
| |
| /* *********************************************************** */ |
| /* NOTE: indexing starts at 1 in the following few arrays: */ |
| /* *********************************************************** */ |
| |
| private String[] paramNames; |
| // feature names, needed to read/create config file |
| |
| private double[] lambda; |
| // the current weight vector. NOTE: indexing starts at 1. |
| |
| private boolean[] isOptimizable; |
| // isOptimizable[c] = true iff lambda[c] should be optimized |
| |
| private double[] minThValue; |
| private double[] maxThValue; |
| // when investigating thresholds along the lambda[c] dimension, only values |
| // in the [minThValue[c],maxThValue[c]] range will be considered. |
| // (*) minThValue and maxThValue can be real values as well as -Infinity and +Infinity |
| // (coded as -Inf and +Inf, respectively, in an input file) |
| |
| private double[] minRandValue; |
| private double[] maxRandValue; |
| // when choosing a random value for the lambda[c] parameter, it will be |
| // chosen from the [minRandValue[c],maxRandValue[c]] range. |
| // (*) minRandValue and maxRandValue must be real values, but not -Inf or +Inf |
| |
| private int damianos_method; |
| private double damianos_param; |
| private double damianos_mult; |
| |
| private double[] defaultLambda; |
| // "default" parameter values; simply the values read in the parameter file |
| |
| /* *********************************************************** */ |
| /* *********************************************************** */ |
| |
| private Decoder myDecoder; |
| // COMMENT OUT if decoder is not Joshua |
| |
| private String decoderCommand; |
| // the command that runs the decoder; read from decoderCommandFileName |
| |
| private int decVerbosity; |
| // verbosity level for decoder output. If 0, decoder output is ignored. |
| // If 1, decoder output is printed. |
| |
| private int validDecoderExitValue; |
| // return value from running the decoder command that indicates success |
| |
| private int numOptThreads; |
| // number of threads to run things in parallel |
| |
| private int saveInterFiles; |
| // 0: nothing, 1: only configs, 2: only n-bests, 3: both configs and n-bests |
| |
| private int compressFiles; |
| // should Z-MERT gzip the large files? If 0, no compression takes place. |
| // If 1, compression is performed on: decoder output files, temp sents files, |
| // and temp feats files. |
| |
| private int sizeOfNBest; |
| // size of N-best list generated by decoder at each iteration |
| // (aka simply N, but N is a bad variable name) |
| |
| private long seed; |
| // seed used to create random number generators |
| |
| private boolean randInit; |
| // if true, parameters are initialized randomly. If false, parameters |
| // are initialized using values from parameter file. |
| |
| private int initsPerIt; |
| // number of intermediate initial points per iteration |
| |
| private int maxMERTIterations, minMERTIterations, prevMERTIterations; |
| // max: maximum number of MERT iterations |
| // min: minimum number of MERT iterations before an early MERT exit |
| // prev: number of previous MERT iterations from which to consider candidates (in addition to |
| // the candidates from the current iteration) |
| |
| private double stopSigValue; |
| // early MERT exit if no weight changes by more than stopSigValue |
| // (but see minMERTIterations above and stopMinIts below) |
| |
| private int stopMinIts; |
| // some early stopping criterion must be satisfied in stopMinIts *consecutive* iterations |
| // before an early exit (but see minMERTIterations above) |
| |
| private boolean oneModificationPerIteration; |
| // if true, each MERT iteration performs at most one parameter modification. |
| // If false, a new MERT iteration starts (i.e. a new N-best list is |
| // generated) only after the previous iteration reaches a local maximum. |
| |
| private String metricName; |
| // name of evaluation metric optimized by MERT |
| |
| private String metricName_display; |
| // name of evaluation metric optimized by MERT, possibly with "doc-level " prefixed |
| |
| private String[] metricOptions; |
| // options for the evaluation metric (e.g. for BLEU, maxGramLength and effLengthMethod) |
| |
| private EvaluationMetric evalMetric; |
| // the evaluation metric used by MERT |
| |
| private int suffStatsCount; |
| // number of sufficient statistics for the evaluation metric |
| |
| private String tmpDirPrefix; |
| // prefix for the ZMERT.temp.* files |
| |
| private boolean passIterationToDecoder; |
| // should the iteration number be passed as an argument to decoderCommandFileName? |
| // If true, iteration number is passed. If false, launch with no arguments. |
| |
| private String dirPrefix; // where are all these files located? |
| private String paramsFileName, docInfoFileName, finalLambdaFileName; |
| private String sourceFileName, refFileName, decoderOutFileName; |
| private String decoderConfigFileName, decoderCommandFileName; |
| private String fakeFileNameTemplate, fakeFileNamePrefix, fakeFileNameSuffix; |
| |
| // e.g. output.it[1-x].someOldRun would be specified as: |
| // output.it?.someOldRun |
| // and we'd have prefix = "output.it" and suffix = ".someOldRun" |
| |
| // private int useDisk; |
| |
/**
 * Creates a MertCore that only records the given decoder configuration.
 * No arguments are processed and {@code initialize} is not invoked by this constructor.
 *
 * @param joshuaConfiguration the configuration object to store in this instance
 */
public MertCore(JoshuaConfiguration joshuaConfiguration) {
  this.joshuaConfiguration = joshuaConfiguration;
}
| |
| public MertCore(String[] args, JoshuaConfiguration joshuaConfiguration) throws FileNotFoundException, IOException { |
| this.joshuaConfiguration = joshuaConfiguration; |
| EvaluationMetric.set_knownMetrics(); |
| processArgsArray(args); |
| initialize(0); |
| } |
| |
| public MertCore(String configFileName,JoshuaConfiguration joshuaConfiguration) throws FileNotFoundException, IOException { |
| this.joshuaConfiguration = joshuaConfiguration; |
| EvaluationMetric.set_knownMetrics(); |
| processArgsArray(cfgFileToArgsArray(configFileName)); |
| initialize(0); |
| } |
| |
| private void initialize(int randsToSkip) throws FileNotFoundException, IOException { |
| println("NegInf: " + NegInf + ", PosInf: " + PosInf + ", epsilon: " + epsilon, 4); |
| |
| randGen = new Random(seed); |
| for (int r = 1; r <= randsToSkip; ++r) { |
| randGen.nextDouble(); |
| } |
| generatedRands = randsToSkip; |
| |
| if (randsToSkip == 0) { |
| println("----------------------------------------------------", 1); |
| println("Initializing...", 1); |
| println("----------------------------------------------------", 1); |
| println("", 1); |
| |
| println("Random number generator initialized using seed: " + seed, 1); |
| println("", 1); |
| } |
| |
| if (refsPerSen > 1) { |
| String refFile = refFileName + "0"; |
| if (! new File(refFile).exists()) |
| refFile = refFileName + ".0"; |
| if (! new File(refFile).exists()) { |
| throw new IOException(String.format("* FATAL: can't find first reference file '%s{0,.0}'", refFileName)); |
| } |
| |
| numSentences = new ExistingUTF8EncodedTextFile(refFile).getNumberOfLines(); |
| } else { |
| numSentences = new ExistingUTF8EncodedTextFile(refFileName).getNumberOfLines(); |
| } |
| |
| processDocInfo(); |
| // sets numDocuments and docOfSentence[] |
| |
| if (numDocuments > 1) metricName_display = "doc-level " + metricName; |
| |
| set_docSubsetInfo(docSubsetInfo); |
| |
| |
| |
| numParams = new ExistingUTF8EncodedTextFile(paramsFileName).getNumberOfNonEmptyLines() - 1; |
| // the parameter file contains one line per parameter |
| // and one line for the normalization method |
| |
| |
| paramNames = new String[1 + numParams]; |
| lambda = new double[1 + numParams]; // indexing starts at 1 in these arrays |
| isOptimizable = new boolean[1 + numParams]; |
| minThValue = new double[1 + numParams]; |
| maxThValue = new double[1 + numParams]; |
| minRandValue = new double[1 + numParams]; |
| maxRandValue = new double[1 + numParams]; |
| // precision = new double[1+numParams]; |
| defaultLambda = new double[1 + numParams]; |
| normalizationOptions = new double[3]; |
| |
| try { |
| // read parameter names |
| BufferedReader inFile_names = new BufferedReader(new FileReader(paramsFileName)); |
| |
| for (int c = 1; c <= numParams; ++c) { |
| String line = ""; |
| while (line != null && line.length() == 0) { // skip empty lines |
| line = inFile_names.readLine(); |
| } |
| String paramName = (line.substring(0, line.indexOf("|||"))).trim(); |
| paramNames[c] = paramName; |
| } |
| |
| inFile_names.close(); |
| } catch (IOException e) { |
| throw new RuntimeException(e); |
| } |
| |
| processParamFile(); |
| // sets the arrays declared just above |
| |
| // SentenceInfo.createV(); // uncomment ONLY IF using vocabulary implementation of SentenceInfo |
| |
| |
| String[][] refSentences = new String[numSentences][refsPerSen]; |
| |
| try { |
| |
| // read in reference sentences |
| BufferedReader reference_readers[] = new BufferedReader[refsPerSen]; |
| if (refsPerSen == 1) { |
| reference_readers[0] = new BufferedReader(new InputStreamReader(new FileInputStream(new File(refFileName)), "utf8")); |
| } else { |
| for (int i = 0; i < refsPerSen; i++) { |
| String refFile = refFileName + i; |
| if (! new File(refFile).exists()) |
| refFile = refFileName + "." + i; |
| if (! new File(refFile).exists()) { |
| throw new RuntimeException(String.format("* FATAL: can't find reference file '%s'", refFile)); |
| } |
| |
| reference_readers[i] = new BufferedReader(new InputStreamReader(new FileInputStream(new File(refFile)), "utf8")); |
| } |
| } |
| |
| for (int i = 0; i < numSentences; ++i) { |
| for (int r = 0; r < refsPerSen; ++r) { |
| // read the rth reference translation for the ith sentence |
| refSentences[i][r] = normalize(reference_readers[r].readLine(), textNormMethod); |
| } |
| } |
| |
| // close all the reference files |
| for (int i = 0; i < refsPerSen; i++) |
| reference_readers[i].close(); |
| |
| // read in decoder command, if any |
| decoderCommand = null; |
| if (decoderCommandFileName != null) { |
| if (fileExists(decoderCommandFileName)) { |
| BufferedReader inFile_comm = new BufferedReader(new FileReader(decoderCommandFileName)); |
| decoderCommand = inFile_comm.readLine(); |
| inFile_comm.close(); |
| } |
| } |
| } catch (IOException e) { |
| throw new RuntimeException(e); |
| } |
| |
| |
| // set static data members for the EvaluationMetric class |
| EvaluationMetric.set_numSentences(numSentences); |
| EvaluationMetric.set_numDocuments(numDocuments); |
| EvaluationMetric.set_refsPerSen(refsPerSen); |
| EvaluationMetric.set_refSentences(refSentences); |
| EvaluationMetric.set_tmpDirPrefix(tmpDirPrefix); |
| |
| evalMetric = EvaluationMetric.getMetric(metricName, metricOptions); |
| |
| suffStatsCount = evalMetric.get_suffStatsCount(); |
| |
| // set static data members for the IntermediateOptimizer class |
| IntermediateOptimizer.set_MERTparams(numSentences, numDocuments, docOfSentence, docSubsetInfo, |
| numParams, normalizationOptions, isOptimizable, minThValue, maxThValue, |
| oneModificationPerIteration, evalMetric, tmpDirPrefix, verbosity); |
| |
| |
| |
| if (randsToSkip == 0) { // i.e. first iteration |
| println("Number of sentences: " + numSentences, 1); |
| println("Number of documents: " + numDocuments, 1); |
| println("Optimizing " + metricName_display, 1); |
| |
| print("docSubsetInfo: {", 1); |
| for (int f = 0; f < 6; ++f) |
| print(docSubsetInfo[f] + ", ", 1); |
| println(docSubsetInfo[6] + "}", 1); |
| |
| println("Number of features: " + numParams, 1); |
| print("Feature names: {", 1); |
| for (int c = 1; c <= numParams; ++c) { |
| print("\"" + paramNames[c] + "\"", 1); |
| if (c < numParams) print(",", 1); |
| } |
| println("}", 1); |
| println("", 1); |
| |
| println("c Default value\tOptimizable?\tCrit. val. range\tRand. val. range", 1); |
| |
| for (int c = 1; c <= numParams; ++c) { |
| print(c + " " + f4.format(lambda[c]) + "\t\t", 1); |
| if (!isOptimizable[c]) { |
| println(" No", 1); |
| } else { |
| print(" Yes\t\t", 1); |
| // print("[" + minThValue[c] + "," + maxThValue[c] + "] @ " + precision[c] + |
| // " precision",1); |
| print(" [" + minThValue[c] + "," + maxThValue[c] + "]", 1); |
| print("\t\t", 1); |
| print(" [" + minRandValue[c] + "," + maxRandValue[c] + "]", 1); |
| println("", 1); |
| } |
| } |
| |
| println("", 1); |
| print("Weight vector normalization method: ", 1); |
| if (normalizationOptions[0] == 0) { |
| println("none.", 1); |
| } else if (normalizationOptions[0] == 1) { |
| println("weights will be scaled so that the \"" + paramNames[(int) normalizationOptions[1]] |
| + "\" weight has an absolute value of " + normalizationOptions[2] + ".", 1); |
| } else if (normalizationOptions[0] == 2) { |
| println("weights will be scaled so that the maximum absolute value is " |
| + normalizationOptions[1] + ".", 1); |
| } else if (normalizationOptions[0] == 3) { |
| println("weights will be scaled so that the minimum absolute value is " |
| + normalizationOptions[1] + ".", 1); |
| } else if (normalizationOptions[0] == 4) { |
| println("weights will be scaled so that the L-" + normalizationOptions[1] + " norm is " |
| + normalizationOptions[2] + ".", 1); |
| } |
| |
| println("", 1); |
| |
| println("----------------------------------------------------", 1); |
| println("", 1); |
| |
| // rename original config file so it doesn't get overwritten |
| // (original name will be restored in finish()) |
| renameFile(decoderConfigFileName, decoderConfigFileName + ".ZMERT.orig"); |
| |
| } // if (randsToSkip == 0) |
| |
| |
| if (decoderCommand == null && fakeFileNameTemplate == null) { |
| println("Loading Joshua decoder...", 1); |
| myDecoder = new Decoder(joshuaConfiguration, decoderConfigFileName + ".ZMERT.orig"); |
| println("...finished loading @ " + (new Date()), 1); |
| println(""); |
| } else { |
| myDecoder = null; |
| } |
| |
| |
| |
| @SuppressWarnings("unchecked") |
| TreeSet<Integer>[] temp_TSA = new TreeSet[numSentences]; |
| indicesOfInterest_all = temp_TSA; |
| |
| for (int i = 0; i < numSentences; ++i) { |
| indicesOfInterest_all[i] = new TreeSet<>(); |
| } |
| |
| |
| } // void initialize(...) |
| |
| public void run_MERT() { |
| run_MERT(minMERTIterations, maxMERTIterations, prevMERTIterations); |
| } |
| |
| public void run_MERT(int minIts, int maxIts, int prevIts) { |
| println("----------------------------------------------------", 1); |
| println("Z-MERT run started @ " + (new Date()), 1); |
| // printMemoryUsage(); |
| println("----------------------------------------------------", 1); |
| println("", 1); |
| |
| if (randInit) { |
| println("Initializing lambda[] randomly.", 1); |
| |
| // initialize optimizable parameters randomly (sampling uniformly from |
| // that parameter's random value range) |
| lambda = randomLambda(); |
| } |
| |
| println("Initial lambda[]: " + lambdaToString(lambda), 1); |
| println("", 1); |
| |
| double FINAL_score = evalMetric.worstPossibleScore(); |
| |
| |
| // int[] lastUsedIndex = new int[numSentences]; |
| int[] maxIndex = new int[numSentences]; |
| // used to grow featVal_array dynamically |
| // HashMap<Integer,int[]>[] suffStats_array = new HashMap[numSentences]; |
| // suffStats_array[i] maps candidates of interest for sentence i to an array |
| // storing the sufficient statistics for that candidate |
| for (int i = 0; i < numSentences; ++i) { |
| // lastUsedIndex[i] = -1; |
| maxIndex[i] = sizeOfNBest - 1; |
| // suffStats_array[i] = new HashMap<Integer,int[]>(); |
| } |
| /* |
| * double[][][] featVal_array = new double[1+numParams][][]; // indexed by |
| * [param][sentence][candidate] featVal_array[0] = null; // param indexing starts at 1 for (int |
| * c = 1; c <= numParams; ++c) { featVal_array[c] = new double[numSentences][]; for (int i = 0; |
| * i < numSentences; ++i) { featVal_array[c][i] = new double[maxIndex[i]]; // will grow |
| * dynamically as needed } } |
| */ |
| int earlyStop = 0; |
| // number of consecutive iteration an early stopping criterion was satisfied |
| |
| for (int iteration = 1;; ++iteration) { |
| |
| double[] A = run_single_iteration(iteration, minIts, maxIts, prevIts, earlyStop, maxIndex); |
| if (A != null) { |
| FINAL_score = A[0]; |
| earlyStop = (int) A[1]; |
| if (A[2] == 1) break; |
| } else { |
| break; |
| } |
| |
| } // for (iteration) |
| |
| println("", 1); |
| |
| println("----------------------------------------------------", 1); |
| println("Z-MERT run ended @ " + (new Date()), 1); |
| // printMemoryUsage(); |
| println("----------------------------------------------------", 1); |
| println("", 1); |
| println("FINAL lambda: " + lambdaToString(lambda) + " (" + metricName_display + ": " |
| + FINAL_score + ")", 1); |
| // check if a lambda is outside its threshold range |
| for (int c = 1; c <= numParams; ++c) { |
| if (lambda[c] < minThValue[c] || lambda[c] > maxThValue[c]) { |
| println("Warning: after normalization, lambda[" + c + "]=" + f4.format(lambda[c]) |
| + " is outside its critical value range.", 1); |
| } |
| } |
| println("", 1); |
| |
| // delete intermediate .temp.*.it* decoder output files |
| for (int iteration = 1; iteration <= maxIts; ++iteration) { |
| if (compressFiles == 1) { |
| deleteFile(tmpDirPrefix + "temp.sents.it" + iteration + ".gz"); |
| deleteFile(tmpDirPrefix + "temp.feats.it" + iteration + ".gz"); |
| if (fileExists(tmpDirPrefix + "temp.stats.it" + iteration + ".copy.gz")) { |
| deleteFile(tmpDirPrefix + "temp.stats.it" + iteration + ".copy.gz"); |
| } else { |
| deleteFile(tmpDirPrefix + "temp.stats.it" + iteration + ".gz"); |
| } |
| } else { |
| deleteFile(tmpDirPrefix + "temp.sents.it" + iteration); |
| deleteFile(tmpDirPrefix + "temp.feats.it" + iteration); |
| if (fileExists(tmpDirPrefix + "temp.stats.it" + iteration + ".copy")) { |
| deleteFile(tmpDirPrefix + "temp.stats.it" + iteration + ".copy"); |
| } else { |
| deleteFile(tmpDirPrefix + "temp.stats.it" + iteration); |
| } |
| } |
| } |
| |
| } // void run_MERT(int maxIts) |
| |
| |
| @SuppressWarnings("unchecked") |
| public double[] run_single_iteration(int iteration, int minIts, int maxIts, int prevIts, |
| int earlyStop, int[] maxIndex) { |
| double FINAL_score = 0; |
| |
| double[] retA = new double[3]; |
| // retA[0]: FINAL_score |
| // retA[1]: earlyStop |
| // retA[2]: should this be the last iteration? |
| |
| boolean done = false; |
| retA[2] = 1; // will only be made 0 if we don't break from the following loop |
| |
| |
| double[][][] featVal_array = new double[1 + numParams][][]; |
| // indexed by [param][sentence][candidate] |
| featVal_array[0] = null; // param indexing starts at 1 |
| for (int c = 1; c <= numParams; ++c) { |
| featVal_array[c] = new double[numSentences][]; |
| for (int i = 0; i < numSentences; ++i) { |
| featVal_array[c][i] = new double[maxIndex[i] + 1]; |
| // will grow dynamically as needed |
| } |
| } |
| |
| |
| while (!done) { // NOTE: this "loop" will only be carried out once |
| println("--- Starting Z-MERT iteration #" + iteration + " @ " + (new Date()) + " ---", 1); |
| |
| // printMemoryUsage(); |
| |
| // run the decoder on all the sentences, producing for each sentence a set of |
| // sizeOfNBest candidates, with numParams feature values for each candidate |
| |
| /******************************/ |
| // CREATE DECODER CONFIG FILE // |
| /******************************/ |
| |
| createConfigFile(lambda, decoderConfigFileName, decoderConfigFileName + ".ZMERT.orig"); |
| // i.e. use the original config file as a template |
| |
| /***************/ |
| // RUN DECODER // |
| /***************/ |
| |
| if (iteration == 1) { |
| println("Decoding using initial weight vector " + lambdaToString(lambda), 1); |
| } else { |
| println("Redecoding using weight vector " + lambdaToString(lambda), 1); |
| } |
| |
| String[] decRunResult = run_decoder(iteration); // iteration passed in case fake decoder will |
| // be used |
| // [0] name of file to be processed |
| // [1] indicates how the output file was obtained: |
| // 1: external decoder |
| // 2: fake decoder |
| // 3: internal decoder |
| |
| if (!decRunResult[1].equals("2")) { |
| println("...finished decoding @ " + (new Date()), 1); |
| } |
| |
| checkFile(decRunResult[0]); |
| |
| println("Producing temp files for iteration " + iteration, 3); |
| |
| produceTempFiles(decRunResult[0], iteration); |
| |
| if (saveInterFiles == 1 || saveInterFiles == 3) { // make copy of intermediate config file |
| if (!copyFile(decoderConfigFileName, decoderConfigFileName + ".ZMERT.it" + iteration)) { |
| println("Warning: attempt to make copy of decoder config file (to create" |
| + decoderConfigFileName + ".ZMERT.it" + iteration + ") was unsuccessful!", 1); |
| } |
| } |
| if (saveInterFiles == 2 || saveInterFiles == 3) { // make copy of intermediate decoder output |
| // file... |
| |
| if (!decRunResult[1].equals("2")) { // ...but only if no fake decoder |
| if (!decRunResult[0].endsWith(".gz")) { |
| if (!copyFile(decRunResult[0], decRunResult[0] + ".ZMERT.it" + iteration)) { |
| println("Warning: attempt to make copy of decoder output file (to create" |
| + decRunResult[0] + ".ZMERT.it" + iteration + ") was unsuccessful!", 1); |
| } |
| } else { |
| String prefix = decRunResult[0].substring(0, decRunResult[0].length() - 3); |
| if (!copyFile(prefix + ".gz", prefix + ".ZMERT.it" + iteration + ".gz")) { |
| println("Warning: attempt to make copy of decoder output file (to create" + prefix |
| + ".ZMERT.it" + iteration + ".gz" + ") was unsuccessful!", 1); |
| } |
| } |
| |
| if (compressFiles == 1 && !decRunResult[0].endsWith(".gz")) { |
| gzipFile(decRunResult[0] + ".ZMERT.it" + iteration); |
| } |
| } // if (!fake) |
| |
| } |
| |
| int[] candCount = new int[numSentences]; |
| int[] lastUsedIndex = new int[numSentences]; |
| ConcurrentHashMap<Integer, int[]>[] suffStats_array = new ConcurrentHashMap[numSentences]; |
| for (int i = 0; i < numSentences; ++i) { |
| candCount[i] = 0; |
| lastUsedIndex[i] = -1; |
| // suffStats_array[i].clear(); |
| suffStats_array[i] = new ConcurrentHashMap<>(); |
| } |
| |
| double[][] initialLambda = new double[1 + initsPerIt][1 + numParams]; |
| // the intermediate "initial" lambdas |
| double[][] finalLambda = new double[1 + initsPerIt][1 + numParams]; |
| // the intermediate "final" lambdas |
| |
| // set initialLambda[][] |
| System.arraycopy(lambda, 1, initialLambda[1], 1, numParams); |
| for (int j = 2; j <= initsPerIt; ++j) { |
| if (damianos_method == 0) { |
| initialLambda[j] = randomLambda(); |
| } else { |
| initialLambda[j] = |
| randomPerturbation(initialLambda[1], iteration, damianos_method, damianos_param, |
| damianos_mult); |
| } |
| } |
| |
| // double[] initialScore = new double[1 + initsPerIt]; |
| double[] finalScore = new double[1 + initsPerIt]; |
| |
| int[][][] best1Cand_suffStats = new int[1 + initsPerIt][numSentences][suffStatsCount]; |
| double[][] best1Score = new double[1 + initsPerIt][numSentences]; |
| // Those two arrays are used to calculate initialScore[] |
| // (the "score" in best1Score refers to that assigned by the |
| // decoder; the "score" in initialScore refers to that |
| // assigned by the evaluation metric) |
| |
| int firstIt = Math.max(1, iteration - prevIts); |
| // i.e. only process candidates from the current iteration and candidates |
| // from up to prevIts previous iterations. |
| println("Reading candidate translations from iterations " + firstIt + "-" + iteration, 1); |
| println("(and computing " + metricName |
| + " sufficient statistics for previously unseen candidates)", 1); |
| print(" Progress: "); |
| |
| int[] newCandidatesAdded = new int[1 + iteration]; |
| for (int it = 1; it <= iteration; ++it) { |
| newCandidatesAdded[it] = 0; |
| } |
| |
| |
| |
| try { |
| |
| // each inFile corresponds to the output of an iteration |
| // (index 0 is not used; no corresponding index for the current iteration) |
| BufferedReader[] inFile_sents = new BufferedReader[iteration]; |
| BufferedReader[] inFile_feats = new BufferedReader[iteration]; |
| BufferedReader[] inFile_stats = new BufferedReader[iteration]; |
| |
| for (int it = firstIt; it < iteration; ++it) { |
| InputStream inStream_sents, inStream_feats, inStream_stats; |
| if (compressFiles == 0) { |
| inStream_sents = new FileInputStream(tmpDirPrefix + "temp.sents.it" + it); |
| inStream_feats = new FileInputStream(tmpDirPrefix + "temp.feats.it" + it); |
| inStream_stats = new FileInputStream(tmpDirPrefix + "temp.stats.it" + it); |
| } else { |
| inStream_sents = |
| new GZIPInputStream( |
| new FileInputStream(tmpDirPrefix + "temp.sents.it" + it + ".gz")); |
| inStream_feats = |
| new GZIPInputStream( |
| new FileInputStream(tmpDirPrefix + "temp.feats.it" + it + ".gz")); |
| inStream_stats = |
| new GZIPInputStream( |
| new FileInputStream(tmpDirPrefix + "temp.stats.it" + it + ".gz")); |
| } |
| |
| inFile_sents[it] = new BufferedReader(new InputStreamReader(inStream_sents, "utf8")); |
| inFile_feats[it] = new BufferedReader(new InputStreamReader(inStream_feats, "utf8")); |
| inFile_stats[it] = new BufferedReader(new InputStreamReader(inStream_stats, "utf8")); |
| } |
| |
| |
| InputStream inStream_sentsCurrIt, inStream_featsCurrIt, inStream_statsCurrIt; |
| if (compressFiles == 0) { |
| inStream_sentsCurrIt = new FileInputStream(tmpDirPrefix + "temp.sents.it" + iteration); |
| inStream_featsCurrIt = new FileInputStream(tmpDirPrefix + "temp.feats.it" + iteration); |
| } else { |
| inStream_sentsCurrIt = |
| new GZIPInputStream(new FileInputStream(tmpDirPrefix + "temp.sents.it" + iteration |
| + ".gz")); |
| inStream_featsCurrIt = |
| new GZIPInputStream(new FileInputStream(tmpDirPrefix + "temp.feats.it" + iteration |
| + ".gz")); |
| } |
| |
| BufferedReader inFile_sentsCurrIt = |
| new BufferedReader(new InputStreamReader(inStream_sentsCurrIt, "utf8")); |
| BufferedReader inFile_featsCurrIt = |
| new BufferedReader(new InputStreamReader(inStream_featsCurrIt, "utf8")); |
| |
| BufferedReader inFile_statsCurrIt = null; // will only be used if statsCurrIt_exists below |
| // is set to true |
| PrintWriter outFile_statsCurrIt = null; // will only be used if statsCurrIt_exists below is |
| // set to false |
| boolean statsCurrIt_exists = false; |
| if (fileExists(tmpDirPrefix + "temp.stats.it" + iteration)) { |
| inStream_statsCurrIt = new FileInputStream(tmpDirPrefix + "temp.stats.it" + iteration); |
| inFile_statsCurrIt = |
| new BufferedReader(new InputStreamReader(inStream_statsCurrIt, "utf8")); |
| statsCurrIt_exists = true; |
| copyFile(tmpDirPrefix + "temp.stats.it" + iteration, tmpDirPrefix + "temp.stats.it" |
| + iteration + ".copy"); |
| } else if (fileExists(tmpDirPrefix + "temp.stats.it" + iteration + ".gz")) { |
| inStream_statsCurrIt = |
| new GZIPInputStream(new FileInputStream(tmpDirPrefix + "temp.stats.it" + iteration |
| + ".gz")); |
| inFile_statsCurrIt = |
| new BufferedReader(new InputStreamReader(inStream_statsCurrIt, "utf8")); |
| statsCurrIt_exists = true; |
| copyFile(tmpDirPrefix + "temp.stats.it" + iteration + ".gz", tmpDirPrefix |
| + "temp.stats.it" + iteration + ".copy.gz"); |
| } else { |
| outFile_statsCurrIt = new PrintWriter(tmpDirPrefix + "temp.stats.it" + iteration); |
| } |
| |
| PrintWriter outFile_statsMerged = new PrintWriter(tmpDirPrefix + "temp.stats.merged"); |
| // write sufficient statistics from all the sentences |
| // from the output files into a single file |
| PrintWriter outFile_statsMergedKnown = |
| new PrintWriter(tmpDirPrefix + "temp.stats.mergedKnown"); |
| // write sufficient statistics from all the sentences |
| // from the output files into a single file |
| |
| FileOutputStream outStream_unknownCands = |
| new FileOutputStream(tmpDirPrefix + "temp.currIt.unknownCands", false); |
| OutputStreamWriter outStreamWriter_unknownCands = |
| new OutputStreamWriter(outStream_unknownCands, "utf8"); |
| BufferedWriter outFile_unknownCands = new BufferedWriter(outStreamWriter_unknownCands); |
| |
| PrintWriter outFile_unknownIndices = |
| new PrintWriter(tmpDirPrefix + "temp.currIt.unknownIndices"); |
| |
| |
| String sents_str, feats_str, stats_str; |
| |
| // BUG: this assumes a candidate string cannot be produced for two |
| // different source sentences, which is not necessarily true |
| // (It's not actually a bug, but only because existingCandStats gets |
| // cleared before moving to the next source sentence.) |
| // FIX: should be made an array, indexed by i |
| HashMap<String, String> existingCandStats = new HashMap<>(); |
| // Stores precalculated sufficient statistics for candidates, in case |
| // the same candidate is seen again. (SS stored as a String.) |
| // Q: Why do we care? If we see the same candidate again, aren't we going |
| // to ignore it? So, why do we care about the SS of this repeat candidate? |
| // A: A "repeat" candidate may not be a repeat candidate in later |
| // iterations if the user specifies a value for prevMERTIterations |
| // that causes MERT to skip candidates from early iterations. |
| double[] currFeatVal = new double[1 + numParams]; |
| String[] featVal_str; |
| |
| int totalCandidateCount = 0; |
| |
| |
| |
| int[] sizeUnknown_currIt = new int[numSentences]; |
| |
| |
| |
| for (int i = 0; i < numSentences; ++i) { |
| |
| for (int j = 1; j <= initsPerIt; ++j) { |
| best1Score[j][i] = NegInf; |
| } |
| |
| for (int it = firstIt; it < iteration; ++it) { |
| // Why up to but *excluding* iteration? |
| // Because the last iteration is handled a little differently, since |
| // the SS must be calculated (and the corresponding file created), |
| // which is not true for previous iterations. |
| |
| for (int n = 0; n <= sizeOfNBest; ++n) { |
| // Why up to and *including* sizeOfNBest? |
| // So that it would read the "||||||" separator even if there is |
| // a complete list of sizeOfNBest candidates. |
| |
| // for the nth candidate for the ith sentence, read the sentence, feature values, |
| // and sufficient statistics from the various temp files |
| |
| sents_str = inFile_sents[it].readLine(); |
| feats_str = inFile_feats[it].readLine(); |
| stats_str = inFile_stats[it].readLine(); |
| |
| if (sents_str.equals("||||||")) { |
| n = sizeOfNBest + 1; |
| } else if (!existingCandStats.containsKey(sents_str)) { |
| |
| outFile_statsMergedKnown.println(stats_str); |
| |
| featVal_str = feats_str.split("\\s+"); |
| |
| /* Sparse (labeled) feature version */ |
| if (feats_str.indexOf('=') != -1) { |
| for (String featurePair: featVal_str) { |
| String[] pair = featurePair.split("="); |
| String name = pair[0]; |
| Double value = Double.parseDouble(pair[1]); |
| currFeatVal[c_fromParamName(name)] = value; |
| } |
| } else { |
| for (int c = 1; c <= numParams; ++c) { |
| try { |
| currFeatVal[c] = Double.parseDouble(featVal_str[c - 1]); |
| } catch (Exception e) { |
| currFeatVal[c] = 0.0; |
| } |
| // print("fV[" + c + "]=" + currFeatVal[c] + " ",4); |
| } |
| // println("",4); |
| } |
| |
| |
| for (int j = 1; j <= initsPerIt; ++j) { |
| double score = 0; // i.e. score assigned by decoder |
| for (int c = 1; c <= numParams; ++c) { |
| score += initialLambda[j][c] * currFeatVal[c]; |
| } |
| if (score > best1Score[j][i]) { |
| best1Score[j][i] = score; |
| String[] tempStats = stats_str.split("\\s+"); |
| for (int s = 0; s < suffStatsCount; ++s) |
| best1Cand_suffStats[j][i][s] = Integer.parseInt(tempStats[s]); |
| } |
| } // for (j) |
| |
| existingCandStats.put(sents_str, stats_str); |
| |
| setFeats(featVal_array, i, lastUsedIndex, maxIndex, currFeatVal); |
| candCount[i] += 1; |
| |
| newCandidatesAdded[it] += 1; |
| |
| } // if unseen candidate |
| |
| } // for (n) |
| |
| } // for (it) |
| |
| outFile_statsMergedKnown.println("||||||"); |
| |
| |
| // now process the candidates of the current iteration |
| // now determine the new candidates of the current iteration |
| |
| /* |
| * remember: BufferedReader inFile_sentsCurrIt BufferedReader inFile_featsCurrIt |
| * PrintWriter outFile_statsCurrIt |
| */ |
| |
| String[] sentsCurrIt_currSrcSent = new String[sizeOfNBest + 1]; |
| |
| Vector<String> unknownCands_V = new Vector<>(); |
| // which candidates (of the i'th source sentence) have not been seen before |
| // this iteration? |
| |
| for (int n = 0; n <= sizeOfNBest; ++n) { |
| // Why up to and *including* sizeOfNBest? |
| // So that it would read the "||||||" separator even if there is |
| // a complete list of sizeOfNBest candidates. |
| |
| // for the nth candidate for the ith sentence, read the sentence, |
| // and store it in the sentsCurrIt_currSrcSent array |
| |
| sents_str = inFile_sentsCurrIt.readLine(); |
| sentsCurrIt_currSrcSent[n] = sents_str; // Note: possibly "||||||" |
| |
| if (sents_str.equals("||||||")) { |
| n = sizeOfNBest + 1; |
| } else if (!existingCandStats.containsKey(sents_str)) { |
| unknownCands_V.add(sents_str); |
| writeLine(sents_str, outFile_unknownCands); |
| outFile_unknownIndices.println(i); |
| newCandidatesAdded[iteration] += 1; |
| existingCandStats.put(sents_str, "U"); // i.e. unknown |
| // we add sents_str to avoid duplicate entries in unknownCands_V |
| } |
| |
| } // for (n) |
| |
| |
| |
| // now unknownCands_V has the candidates for which we need to calculate |
| // sufficient statistics (for the i'th source sentence) |
| int sizeUnknown = unknownCands_V.size(); |
| sizeUnknown_currIt[i] = sizeUnknown; |
| |
| /*********************************************/ |
| /* |
| * String[] unknownCands = new String[sizeUnknown]; unknownCands_V.toArray(unknownCands); |
| * int[] indices = new int[sizeUnknown]; for (int d = 0; d < sizeUnknown; ++d) { |
| * existingCandStats.remove(unknownCands[d]); // remove the (unknownCands[d],"U") entry |
| * from existingCandStats // (we had added it while constructing unknownCands_V to avoid |
| * duplicate entries) indices[d] = i; } |
| */ |
| /*********************************************/ |
| |
| existingCandStats.clear(); |
| |
| } // for (i) |
| |
| /* |
| * int[][] newSuffStats = null; if (!statsCurrIt_exists && sizeUnknown > 0) { newSuffStats = |
| * evalMetric.suffStats(unknownCands, indices); } |
| */ |
| |
| outFile_statsMergedKnown.close(); |
| outFile_unknownCands.close(); |
| outFile_unknownIndices.close(); |
| |
| |
| for (int it = firstIt; it < iteration; ++it) { |
| inFile_sents[it].close(); |
| inFile_stats[it].close(); |
| |
| InputStream inStream_sents, inStream_stats; |
| if (compressFiles == 0) { |
| inStream_sents = new FileInputStream(tmpDirPrefix + "temp.sents.it" + it); |
| inStream_stats = new FileInputStream(tmpDirPrefix + "temp.stats.it" + it); |
| } else { |
| inStream_sents = |
| new GZIPInputStream( |
| new FileInputStream(tmpDirPrefix + "temp.sents.it" + it + ".gz")); |
| inStream_stats = |
| new GZIPInputStream( |
| new FileInputStream(tmpDirPrefix + "temp.stats.it" + it + ".gz")); |
| } |
| |
| inFile_sents[it] = new BufferedReader(new InputStreamReader(inStream_sents, "utf8")); |
| inFile_stats[it] = new BufferedReader(new InputStreamReader(inStream_stats, "utf8")); |
| } |
| |
| inFile_sentsCurrIt.close(); |
| if (compressFiles == 0) { |
| inStream_sentsCurrIt = new FileInputStream(tmpDirPrefix + "temp.sents.it" + iteration); |
| } else { |
| inStream_sentsCurrIt = |
| new GZIPInputStream(new FileInputStream(tmpDirPrefix + "temp.sents.it" + iteration |
| + ".gz")); |
| } |
| inFile_sentsCurrIt = |
| new BufferedReader(new InputStreamReader(inStream_sentsCurrIt, "utf8")); |
| |
| |
| |
| // calculate SS for unseen candidates and write them to file |
| FileInputStream inStream_statsCurrIt_unknown = null; |
| BufferedReader inFile_statsCurrIt_unknown = null; |
| |
| if (!statsCurrIt_exists && newCandidatesAdded[iteration] > 0) { |
| // create the file... |
| evalMetric.createSuffStatsFile(tmpDirPrefix + "temp.currIt.unknownCands", tmpDirPrefix |
| + "temp.currIt.unknownIndices", tmpDirPrefix + "temp.stats.unknown", sizeOfNBest); |
| |
| // ...and open it |
| inStream_statsCurrIt_unknown = new FileInputStream(tmpDirPrefix + "temp.stats.unknown"); |
| inFile_statsCurrIt_unknown = |
| new BufferedReader(new InputStreamReader(inStream_statsCurrIt_unknown, "utf8")); |
| } |
| |
| // OPEN mergedKnown file |
| FileInputStream instream_statsMergedKnown = |
| new FileInputStream(tmpDirPrefix + "temp.stats.mergedKnown"); |
| BufferedReader inFile_statsMergedKnown = |
| new BufferedReader(new InputStreamReader(instream_statsMergedKnown, "utf8")); |
| |
| for (int i = 0; i < numSentences; ++i) { |
| |
| // reprocess candidates from previous iterations |
| for (int it = firstIt; it < iteration; ++it) { |
| for (int n = 0; n <= sizeOfNBest; ++n) { |
| |
| sents_str = inFile_sents[it].readLine(); |
| stats_str = inFile_stats[it].readLine(); |
| |
| if (sents_str.equals("||||||")) { |
| n = sizeOfNBest + 1; |
| } else if (!existingCandStats.containsKey(sents_str)) { |
| existingCandStats.put(sents_str, stats_str); |
| } // if unseen candidate |
| |
| } // for (n) |
| } // for (it) |
| |
| // copy relevant portion from mergedKnown to the merged file |
| String line_mergedKnown = inFile_statsMergedKnown.readLine(); |
| while (!line_mergedKnown.equals("||||||")) { |
| outFile_statsMerged.println(line_mergedKnown); |
| line_mergedKnown = inFile_statsMergedKnown.readLine(); |
| } |
| |
| int[] stats = new int[suffStatsCount]; |
| |
| for (int n = 0; n <= sizeOfNBest; ++n) { |
| // Why up to and *including* sizeOfNBest? |
| // So that it would read the "||||||" separator even if there is |
| // a complete list of sizeOfNBest candidates. |
| |
| // for the nth candidate for the ith sentence, read the sentence, feature values, |
| // and sufficient statistics from the various temp files |
| |
| sents_str = inFile_sentsCurrIt.readLine(); |
| feats_str = inFile_featsCurrIt.readLine(); |
| |
| if (sents_str.equals("||||||")) { |
| n = sizeOfNBest + 1; |
| } else if (!existingCandStats.containsKey(sents_str)) { |
| |
| if (!statsCurrIt_exists) { |
| stats_str = inFile_statsCurrIt_unknown.readLine(); |
| |
| String[] temp_stats = stats_str.split("\\s+"); |
| for (int s = 0; s < suffStatsCount; ++s) { |
| stats[s] = Integer.parseInt(temp_stats[s]); |
| } |
| |
| /* |
| * stats_str = ""; for (int s = 0; s < suffStatsCount-1; ++s) { stats[s] = |
| * newSuffStats[d][s]; stats_str += (stats[s] + " "); } stats[suffStatsCount-1] = |
| * newSuffStats[d][suffStatsCount-1]; stats_str += stats[suffStatsCount-1]; |
| */ |
| |
| outFile_statsCurrIt.println(stats_str); |
| } else { |
| stats_str = inFile_statsCurrIt.readLine(); |
| String[] temp_stats = stats_str.split("\\s+"); |
| for (int s = 0; s < suffStatsCount; ++s) { |
| try { |
| stats[s] = Integer.parseInt(temp_stats[s]); |
| } catch (Exception e) { |
| stats[s] = 0; |
| } |
| } |
| } |
| |
| outFile_statsMerged.println(stats_str); |
| |
| featVal_str = feats_str.split("\\s+"); |
| |
| if (feats_str.indexOf('=') != -1) { |
| for (String featurePair: featVal_str) { |
| String[] pair = featurePair.split("="); |
| String name = pair[0]; |
| Double value = Double.parseDouble(pair[1]); |
| currFeatVal[c_fromParamName(name)] = value; |
| } |
| } else { |
| for (int c = 1; c <= numParams; ++c) { |
| try { |
| currFeatVal[c] = Double.parseDouble(featVal_str[c - 1]); |
| } catch (Exception e) { |
| // NumberFormatException, ArrayIndexOutOfBoundsException |
| currFeatVal[c] = 0.0; |
| } |
| |
| // print("fV[" + c + "]=" + currFeatVal[c] + " ",4); |
| } |
| } |
| // println("",4); |
| |
| |
| for (int j = 1; j <= initsPerIt; ++j) { |
| double score = 0; // i.e. score assigned by decoder |
| for (int c = 1; c <= numParams; ++c) { |
| score += initialLambda[j][c] * currFeatVal[c]; |
| } |
| if (score > best1Score[j][i]) { |
| best1Score[j][i] = score; |
| System.arraycopy(stats, 0, best1Cand_suffStats[j][i], 0, suffStatsCount); |
| } |
| } // for (j) |
| |
| existingCandStats.put(sents_str, stats_str); |
| |
| setFeats(featVal_array, i, lastUsedIndex, maxIndex, currFeatVal); |
| candCount[i] += 1; |
| |
| // newCandidatesAdded[iteration] += 1; |
| // moved to code above detecting new candidates |
| |
| } else { |
| if (statsCurrIt_exists) |
| inFile_statsCurrIt.readLine(); |
| else { |
| // write SS to outFile_statsCurrIt |
| stats_str = existingCandStats.get(sents_str); |
| outFile_statsCurrIt.println(stats_str); |
| } |
| } |
| |
| } // for (n) |
| |
| // now d = sizeUnknown_currIt[i] - 1 |
| |
| if (statsCurrIt_exists) |
| inFile_statsCurrIt.readLine(); |
| else |
| outFile_statsCurrIt.println("||||||"); |
| |
| existingCandStats.clear(); |
| totalCandidateCount += candCount[i]; |
| |
| if ((i + 1) % 500 == 0) { |
| print((i + 1) + "\n" + " ", 1); |
| } else if ((i + 1) % 100 == 0) { |
| print("+", 1); |
| } else if ((i + 1) % 25 == 0) { |
| print(".", 1); |
| } |
| |
| } // for (i) |
| |
| inFile_statsMergedKnown.close(); |
| outFile_statsMerged.close(); |
| |
| println("", 1); // finish progress line |
| |
| for (int it = firstIt; it < iteration; ++it) { |
| inFile_sents[it].close(); |
| inFile_feats[it].close(); |
| inFile_stats[it].close(); |
| } |
| |
| inFile_sentsCurrIt.close(); |
| inFile_featsCurrIt.close(); |
| if (statsCurrIt_exists) |
| inFile_statsCurrIt.close(); |
| else |
| outFile_statsCurrIt.close(); |
| |
| if (compressFiles == 1 && !statsCurrIt_exists) { |
| gzipFile(tmpDirPrefix + "temp.stats.it" + iteration); |
| } |
| |
| deleteFile(tmpDirPrefix + "temp.currIt.unknownCands"); |
| deleteFile(tmpDirPrefix + "temp.currIt.unknownIndices"); |
| deleteFile(tmpDirPrefix + "temp.stats.unknown"); |
| deleteFile(tmpDirPrefix + "temp.stats.mergedKnown"); |
| |
| // cleanupMemory(); |
| |
| println("Processed " + totalCandidateCount + " distinct candidates " + "(about " |
| + totalCandidateCount / numSentences + " per sentence):", 1); |
| for (int it = firstIt; it <= iteration; ++it) { |
| println("newCandidatesAdded[it=" + it + "] = " + newCandidatesAdded[it] + " (about " |
| + newCandidatesAdded[it] / numSentences + " per sentence)", 1); |
| } |
| |
| println("", 1); |
| |
| } catch (IOException e) { |
| throw new RuntimeException(e); |
| } |
| |
| |
| if (newCandidatesAdded[iteration] == 0) { |
| if (!oneModificationPerIteration) { |
| println("No new candidates added in this iteration; exiting Z-MERT.", 1); |
| println("", 1); |
| println("--- Z-MERT iteration #" + iteration + " ending @ " + (new Date()) + " ---", 1); |
| println("", 1); |
| return null; // THIS MEANS THAT THE OLD VALUES SHOULD BE KEPT BY THE CALLER |
| } else { |
| println("Note: No new candidates added in this iteration.", 1); |
| } |
| } |
| |
| // run the initsPerIt optimizations, in parallel, across numOptThreads threads |
| ExecutorService pool = Executors.newFixedThreadPool(numOptThreads); |
| Semaphore blocker = new Semaphore(0); |
| Vector<String>[] threadOutput = new Vector[initsPerIt + 1]; |
| |
| for (int j = 1; j <= initsPerIt; ++j) { |
| threadOutput[j] = new Vector<>(); |
| pool.execute(new IntermediateOptimizer(j, blocker, threadOutput[j], initialLambda[j], |
| finalLambda[j], best1Cand_suffStats[j], finalScore, candCount, featVal_array, |
| suffStats_array)); |
| } |
| |
| pool.shutdown(); |
| |
| try { |
| blocker.acquire(initsPerIt); |
| } catch (java.lang.InterruptedException e) { |
| throw new RuntimeException(e); |
| } |
| |
| // extract output from threadOutput[] |
| for (int j = 1; j <= initsPerIt; ++j) { |
| // no verbosity check needed; thread already checked |
| (threadOutput[j]).forEach(this::println); |
| } |
| |
| int best_j = 1; |
| double bestFinalScore = finalScore[1]; |
| for (int j = 2; j <= initsPerIt; ++j) { |
| if (evalMetric.isBetter(finalScore[j], bestFinalScore)) { |
| best_j = j; |
| bestFinalScore = finalScore[j]; |
| } |
| } |
| |
| if (initsPerIt > 1) { |
| println("Best final lambda is lambda[j=" + best_j + "] " + "(" + metricName_display + ": " |
| + f4.format(bestFinalScore) + ").", 1); |
| println("", 1); |
| } |
| |
| FINAL_score = bestFinalScore; |
| |
| boolean anyParamChanged = false; |
| boolean anyParamChangedSignificantly = false; |
| |
| for (int c = 1; c <= numParams; ++c) { |
| if (finalLambda[best_j][c] != lambda[c]) { |
| anyParamChanged = true; |
| } |
| if (Math.abs(finalLambda[best_j][c] - lambda[c]) > stopSigValue) { |
| anyParamChangedSignificantly = true; |
| } |
| } |
| |
| System.arraycopy(finalLambda[best_j], 1, lambda, 1, numParams); |
| println("--- Z-MERT iteration #" + iteration + " ending @ " + (new Date()) + " ---", 1); |
| println("", 1); |
| |
| if (!anyParamChanged) { |
| println("No parameter value changed in this iteration; exiting Z-MERT.", 1); |
| println("", 1); |
| break; // exit for (iteration) loop preemptively |
| } |
| |
| // check if a lambda is outside its threshold range |
| for (int c = 1; c <= numParams; ++c) { |
| if (lambda[c] < minThValue[c] || lambda[c] > maxThValue[c]) { |
| println("Warning: after normalization, lambda[" + c + "]=" + f4.format(lambda[c]) |
| + " is outside its critical value range.", 1); |
| } |
| } |
| |
| // was an early stopping criterion satisfied? |
| boolean critSatisfied = false; |
| if (!anyParamChangedSignificantly && stopSigValue >= 0) { |
| println("Note: No parameter value changed significantly " + "(i.e. by more than " |
| + stopSigValue + ") in this iteration.", 1); |
| critSatisfied = true; |
| } |
| |
| if (critSatisfied) { |
| ++earlyStop; |
| println("", 1); |
| } else { |
| earlyStop = 0; |
| } |
| |
| // if min number of iterations executed, investigate if early exit should happen |
| if (iteration >= minIts && earlyStop >= stopMinIts) { |
| println("Some early stopping criteria has been observed " + "in " + stopMinIts |
| + " consecutive iterations; exiting Z-MERT.", 1); |
| println("", 1); |
| break; // exit for (iteration) loop preemptively |
| } |
| |
| // if max number of iterations executed, exit |
| if (iteration >= maxIts) { |
| println("Maximum number of MERT iterations reached; exiting Z-MERT.", 1); |
| println("", 1); |
| break; // exit for (iteration) loop |
| } |
| |
| println("Next iteration will decode with lambda: " + lambdaToString(lambda), 1); |
| println("", 1); |
| |
| // printMemoryUsage(); |
| for (int i = 0; i < numSentences; ++i) { |
| suffStats_array[i].clear(); |
| } |
| // cleanupMemory(); |
| // println("",2); |
| |
| |
| retA[2] = 0; // i.e. this should NOT be the last iteration |
| done = true; |
| |
| } // while (!done) // NOTE: this "loop" will only be carried out once |
| |
| |
| // delete .temp.stats.merged file, since it is not needed in the next |
| // iteration (it will be recreated from scratch) |
| deleteFile(tmpDirPrefix + "temp.stats.merged"); |
| |
| retA[0] = FINAL_score; |
| retA[1] = earlyStop; |
| return retA; |
| |
| } // run_single_iteration |
| |
| private String lambdaToString(double[] lambdaA) { |
| String retStr = "{"; |
| for (int c = 1; c <= numParams - 1; ++c) { |
| retStr += "" + lambdaA[c] + ", "; |
| } |
| retStr += "" + lambdaA[numParams] + "}"; |
| |
| return retStr; |
| } |
| |
| private String[] run_decoder(int iteration) { |
| String[] retSA = new String[2]; |
| // [0] name of file to be processed |
| // [1] indicates how the output file was obtained: |
| // 1: external decoder |
| // 2: fake decoder |
| // 3: internal decoder |
| |
| if (fakeFileNameTemplate != null |
| && fileExists(fakeFileNamePrefix + iteration + fakeFileNameSuffix)) { |
| String fakeFileName = fakeFileNamePrefix + iteration + fakeFileNameSuffix; |
| println("Not running decoder; using " + fakeFileName + " instead.", 1); |
| /* |
| * if (fakeFileName.endsWith(".gz")) { copyFile(fakeFileName,decoderOutFileName+".gz"); |
| * gunzipFile(decoderOutFileName+".gz"); } else { copyFile(fakeFileName,decoderOutFileName); } |
| */ |
| retSA[0] = fakeFileName; |
| retSA[1] = "2"; |
| |
| } else { |
| println("Running external decoder...", 1); |
| |
| try { |
| ArrayList<String> cmd = new ArrayList<>(); |
| cmd.add(decoderCommandFileName); |
| |
| if (passIterationToDecoder) |
| cmd.add(Integer.toString(iteration)); |
| |
| ProcessBuilder pb = new ProcessBuilder(cmd); |
| // this merges the error and output streams of the subprocess |
| pb.redirectErrorStream(true); |
| Process p = pb.start(); |
| |
| // capture the sub-command's output |
| StreamGobbler outputGobbler = new StreamGobbler(p.getInputStream(), decVerbosity); |
| outputGobbler.start(); |
| |
| int decStatus = p.waitFor(); |
| if (decStatus != validDecoderExitValue) { |
| throw new RuntimeException("Call to decoder returned " + decStatus + "; was expecting " |
| + validDecoderExitValue + "."); |
| } |
| } catch (IOException| InterruptedException e) { |
| throw new RuntimeException(e); |
| } |
| |
| retSA[0] = decoderOutFileName; |
| retSA[1] = "1"; |
| } |
| |
| return retSA; |
| |
| } |
| |
| private void produceTempFiles(String nbestFileName, int iteration) { |
| try { |
| String sentsFileName = tmpDirPrefix + "temp.sents.it" + iteration; |
| String featsFileName = tmpDirPrefix + "temp.feats.it" + iteration; |
| |
| FileOutputStream outStream_sents = new FileOutputStream(sentsFileName, false); |
| OutputStreamWriter outStreamWriter_sents = new OutputStreamWriter(outStream_sents, "utf8"); |
| BufferedWriter outFile_sents = new BufferedWriter(outStreamWriter_sents); |
| |
| PrintWriter outFile_feats = new PrintWriter(featsFileName); |
| |
| |
| InputStream inStream_nbest = null; |
| if (nbestFileName.endsWith(".gz")) { |
| inStream_nbest = new GZIPInputStream(new FileInputStream(nbestFileName)); |
| } else { |
| inStream_nbest = new FileInputStream(nbestFileName); |
| } |
| BufferedReader inFile_nbest = |
| new BufferedReader(new InputStreamReader(inStream_nbest, "utf8")); |
| |
| String line; // , prevLine; |
| String candidate_str = ""; |
| String feats_str = ""; |
| |
| int i = 0; |
| int n = 0; |
| line = inFile_nbest.readLine(); |
| |
| while (line != null) { |
| |
| // skip blank lines |
| if (line.equals("")) continue; |
| |
| // skip lines that aren't formatted correctly |
| if (!line.contains("|||")) |
| continue; |
| |
| /* |
| * line format: |
| * |
| * i ||| words of candidate translation . ||| feat-1_val feat-2_val ... feat-numParams_val |
| * .* |
| * |
| * Updated September 2012: features can now be named (for sparse feature compatibility). |
| * You must name all features or none of them. |
| */ |
| |
| // in a well formed file, we'd find the nth candidate for the ith sentence |
| |
| int read_i = Integer.parseInt((line.substring(0, line.indexOf("|||"))).trim()); |
| |
| if (read_i != i) { |
| writeLine("||||||", outFile_sents); |
| outFile_feats.println("||||||"); |
| n = 0; |
| ++i; |
| } |
| |
| line = (line.substring(line.indexOf("|||") + 3)).trim(); // get rid of initial text |
| |
| candidate_str = (line.substring(0, line.indexOf("|||"))).trim(); |
| feats_str = (line.substring(line.indexOf("|||") + 3)).trim(); |
| // get rid of candidate string |
| |
| int junk_i = feats_str.indexOf("|||"); |
| if (junk_i >= 0) { |
| feats_str = (feats_str.substring(0, junk_i)).trim(); |
| } |
| |
| writeLine(normalize(candidate_str, textNormMethod), outFile_sents); |
| outFile_feats.println(feats_str); |
| |
| ++n; |
| if (n == sizeOfNBest) { |
| writeLine("||||||", outFile_sents); |
| outFile_feats.println("||||||"); |
| n = 0; |
| ++i; |
| } |
| |
| line = inFile_nbest.readLine(); |
| } |
| |
| if (i != numSentences) { // last sentence had too few candidates |
| writeLine("||||||", outFile_sents); |
| outFile_feats.println("||||||"); |
| } |
| |
| inFile_nbest.close(); |
| outFile_sents.close(); |
| outFile_feats.close(); |
| |
| if (compressFiles == 1) { |
| gzipFile(sentsFileName); |
| gzipFile(featsFileName); |
| } |
| |
| } catch (IOException e) { |
| throw new RuntimeException(e); |
| } |
| |
| } |
| |
| private void createConfigFile(double[] params, String cfgFileName, String templateFileName) { |
| try { |
| // i.e. create cfgFileName, which is similar to templateFileName, but with |
| // params[] as parameter values |
| |
| BufferedReader inFile = new BufferedReader(new FileReader(templateFileName)); |
| PrintWriter outFile = new PrintWriter(cfgFileName); |
| |
| String line = inFile.readLine(); |
| |
| while (line != null) { |
| int c_match = -1; |
| for (int c = 1; c <= numParams; ++c) { |
| if (line.startsWith(paramNames[c] + " ")) { |
| c_match = c; |
| break; |
| } |
| } |
| |
| if (c_match == -1) { |
| outFile.println(line); |
| } else { |
| outFile.println(paramNames[c_match] + " " + params[c_match]); |
| } |
| |
| line = inFile.readLine(); |
| } |
| |
| inFile.close(); |
| outFile.close(); |
| } catch (IOException e) { |
| throw new RuntimeException(e); |
| } |
| } |
| |
| private void processParamFile() { |
| // process parameter file |
| Scanner inFile_init = null; |
| try { |
| inFile_init = new Scanner(new FileReader(paramsFileName)); |
| } catch (FileNotFoundException e) { |
| throw new RuntimeException("FileNotFoundException in MertCore.processParamFile(): " + e.getMessage()); |
| } |
| |
| String dummy = ""; |
| |
| // initialize lambda[] and other related arrays |
| for (int c = 1; c <= numParams; ++c) { |
| // skip parameter name |
| while (!dummy.equals("|||")) { |
| dummy = inFile_init.next(); |
| } |
| |
| // read default value |
| lambda[c] = inFile_init.nextDouble(); |
| defaultLambda[c] = lambda[c]; |
| |
| // read isOptimizable |
| dummy = inFile_init.next(); |
| switch (dummy) { |
| case "Opt": |
| isOptimizable[c] = true; |
| break; |
| case "Fix": |
| isOptimizable[c] = false; |
| break; |
| default: |
| throw new RuntimeException( |
| "Unknown isOptimizable string " + dummy + " (must be either Opt or Fix)"); |
| } |
| |
| if (!isOptimizable[c]) { // skip next four values |
| dummy = inFile_init.next(); |
| dummy = inFile_init.next(); |
| dummy = inFile_init.next(); |
| dummy = inFile_init.next(); |
| } else { |
| // set minThValue[c] and maxThValue[c] (range for thresholds to investigate) |
| dummy = inFile_init.next(); |
| switch (dummy) { |
| case "-Inf": |
| minThValue[c] = NegInf; |
| break; |
| case "+Inf": |
| throw new RuntimeException("minThValue[" + c + "] cannot be +Inf!"); |
| default: |
| minThValue[c] = Double.parseDouble(dummy); |
| break; |
| } |
| |
| dummy = inFile_init.next(); |
| switch (dummy) { |
| case "-Inf": |
| throw new RuntimeException("maxThValue[" + c + "] cannot be -Inf!"); |
| case "+Inf": |
| maxThValue[c] = PosInf; |
| break; |
| default: |
| maxThValue[c] = Double.parseDouble(dummy); |
| break; |
| } |
| |
| // set minRandValue[c] and maxRandValue[c] (range for random values) |
| dummy = inFile_init.next(); |
| if (dummy.equals("-Inf") || dummy.equals("+Inf")) { |
| throw new RuntimeException("minRandValue[" + c + "] cannot be -Inf or +Inf!"); |
| } else { |
| minRandValue[c] = Double.parseDouble(dummy); |
| } |
| |
| dummy = inFile_init.next(); |
| if (dummy.equals("-Inf") || dummy.equals("+Inf")) { |
| throw new RuntimeException("maxRandValue[" + c + "] cannot be -Inf or +Inf!"); |
| } else { |
| maxRandValue[c] = Double.parseDouble(dummy); |
| } |
| |
| |
| // check for illogical values |
| if (minThValue[c] > maxThValue[c]) { |
| throw new RuntimeException("minThValue[" + c + "]=" + minThValue[c] |
| + " > " + maxThValue[c] + "=maxThValue[" + c + "]!"); |
| } |
| if (minRandValue[c] > maxRandValue[c]) { |
| throw new RuntimeException("minRandValue[" + c + "]=" + minRandValue[c] |
| + " > " + maxRandValue[c] + "=maxRandValue[" + c + "]!"); |
| } |
| |
| // check for odd values |
| if (!(minThValue[c] <= lambda[c] && lambda[c] <= maxThValue[c])) { |
| println("Warning: lambda[" + c + "] has initial value (" + lambda[c] + ")", 1); |
| println(" that is outside its critical value range " + "[" + minThValue[c] + "," |
| + maxThValue[c] + "]", 1); |
| } |
| |
| if (minThValue[c] == maxThValue[c]) { |
| println("Warning: lambda[" + c + "] has " + "minThValue = maxThValue = " + minThValue[c] |
| + ".", 1); |
| } |
| |
| if (minRandValue[c] == maxRandValue[c]) { |
| println("Warning: lambda[" + c + "] has " + "minRandValue = maxRandValue = " |
| + minRandValue[c] + ".", 1); |
| } |
| |
| if (minRandValue[c] < minThValue[c] || minRandValue[c] > maxThValue[c] |
| || maxRandValue[c] < minThValue[c] || maxRandValue[c] > maxThValue[c]) { |
| println("Warning: The random value range for lambda[" + c + "] is not contained", 1); |
| println(" within its critical value range.", 1); |
| } |
| |
| } // if (!isOptimizable[c]) |
| |
| /* |
| * precision[c] = inFile_init.nextDouble(); if (precision[c] < 0) { println("precision[" + c + |
| * "]=" + precision[c] + " < 0! Must be non-negative."); System.exit(21); } |
| */ |
| |
| } |
| |
| // set normalizationOptions[] |
| String origLine = ""; |
| while (origLine != null && origLine.length() == 0) { |
| origLine = inFile_init.nextLine(); |
| } |
| |
| |
| // How should a lambda[] vector be normalized (before decoding)? |
| // nO[0] = 0: no normalization |
| // nO[0] = 1: scale so that parameter nO[2] has absolute value nO[1] |
| // nO[0] = 2: scale so that the maximum absolute value is nO[1] |
| // nO[0] = 3: scale so that the minimum absolute value is nO[1] |
| // nO[0] = 4: scale so that the L-nO[1] norm equals nO[2] |
| |
| // normalization = none |
| // normalization = absval 1 lm |
| // normalization = maxabsval 1 |
| // normalization = minabsval 1 |
| // normalization = LNorm 2 1 |
| |
| dummy = (origLine.substring(origLine.indexOf("=") + 1)).trim(); |
| String[] dummyA = dummy.split("\\s+"); |
| |
| switch (dummyA[0]) { |
| case "none": |
| normalizationOptions[0] = 0; |
| break; |
| case "absval": |
| normalizationOptions[0] = 1; |
| normalizationOptions[1] = Double.parseDouble(dummyA[1]); |
| String pName = dummyA[2]; |
| for (int i = 3; i < dummyA.length; ++i) { // in case parameter name has multiple words |
| pName = pName + " " + dummyA[i]; |
| } |
| normalizationOptions[2] = c_fromParamName(pName); |
| |
| if (normalizationOptions[1] <= 0) { |
| throw new RuntimeException("Value for the absval normalization method must be positive."); |
| } |
| if (normalizationOptions[2] == 0) { |
| throw new RuntimeException("Unrecognized feature name " + normalizationOptions[2] |
| + " for absval normalization method."); |
| } |
| break; |
| case "maxabsval": |
| normalizationOptions[0] = 2; |
| normalizationOptions[1] = Double.parseDouble(dummyA[1]); |
| if (normalizationOptions[1] <= 0) { |
| throw new RuntimeException( |
| "Value for the maxabsval normalization method must be positive."); |
| } |
| break; |
| case "minabsval": |
| normalizationOptions[0] = 3; |
| normalizationOptions[1] = Double.parseDouble(dummyA[1]); |
| if (normalizationOptions[1] <= 0) { |
| throw new RuntimeException( |
| "Value for the minabsval normalization method must be positive."); |
| } |
| break; |
| case "LNorm": |
| normalizationOptions[0] = 4; |
| normalizationOptions[1] = Double.parseDouble(dummyA[1]); |
| normalizationOptions[2] = Double.parseDouble(dummyA[2]); |
| if (normalizationOptions[1] <= 0 || normalizationOptions[2] <= 0) { |
| throw new RuntimeException( |
| "Both values for the LNorm normalization method must be positive."); |
| } |
| break; |
| default: |
| throw new RuntimeException("Unrecognized normalization method " + dummyA[0] + "; " |
| + "must be one of none, absval, maxabsval, and LNorm."); |
| } |
| |
| inFile_init.close(); |
| } |
| |
| private void processDocInfo() { |
| // sets numDocuments and docOfSentence[] |
| docOfSentence = new int[numSentences]; |
| |
| if (docInfoFileName == null) { |
| for (int i = 0; i < numSentences; ++i) |
| docOfSentence[i] = 0; |
| numDocuments = 1; |
| } else { |
| |
| try { |
| |
| // 4 possible formats: |
| // 1) List of numbers, one per document, indicating # sentences in each document. |
| // 2) List of "docName size" pairs, one per document, indicating name of document and # |
| // sentences. |
| // 3) List of docName's, one per sentence, indicating which doument each sentence belongs |
| // to. |
| // 4) List of docName_number's, one per sentence, indicating which doument each sentence |
| // belongs to, |
| // and its order in that document. (can also use '-' instead of '_') |
| |
| int docInfoSize = new ExistingUTF8EncodedTextFile(docInfoFileName).getNumberOfNonEmptyLines(); |
| |
| if (docInfoSize < numSentences) { // format #1 or #2 |
| numDocuments = docInfoSize; |
| int i = 0; |
| |
| BufferedReader inFile = new BufferedReader(new FileReader(docInfoFileName)); |
| String line = inFile.readLine(); |
| boolean format1 = (!(line.contains(" "))); |
| |
| for (int doc = 0; doc < numDocuments; ++doc) { |
| |
| if (doc != 0) line = inFile.readLine(); |
| |
| int docSize = 0; |
| if (format1) { |
| docSize = Integer.parseInt(line); |
| } else { |
| docSize = Integer.parseInt(line.split("\\s+")[1]); |
| } |
| |
| for (int i2 = 1; i2 <= docSize; ++i2) { |
| docOfSentence[i] = doc; |
| ++i; |
| } |
| |
| } |
| |
| // now i == numSentences |
| |
| inFile.close(); |
| |
| } else if (docInfoSize == numSentences) { // format #3 or #4 |
| |
| boolean format3 = false; |
| |
| HashSet<String> seenStrings = new HashSet<>(); |
| BufferedReader inFile = new BufferedReader(new FileReader(docInfoFileName)); |
| for (int i = 0; i < numSentences; ++i) { |
| // set format3 = true if a duplicate is found |
| String line = inFile.readLine(); |
| if (seenStrings.contains(line)) format3 = true; |
| seenStrings.add(line); |
| } |
| |
| inFile.close(); |
| |
| HashSet<String> seenDocNames = new HashSet<>(); |
| HashMap<String, Integer> docOrder = new HashMap<>(); |
| // maps a document name to the order (0-indexed) in which it was seen |
| |
| inFile = new BufferedReader(new FileReader(docInfoFileName)); |
| for (int i = 0; i < numSentences; ++i) { |
| String line = inFile.readLine(); |
| |
| String docName = ""; |
| if (format3) { |
| docName = line; |
| } else { |
| int sep_i = Math.max(line.lastIndexOf('_'), line.lastIndexOf('-')); |
| docName = line.substring(0, sep_i); |
| } |
| |
| if (!seenDocNames.contains(docName)) { |
| seenDocNames.add(docName); |
| docOrder.put(docName, seenDocNames.size() - 1); |
| } |
| |
| int docOrder_i = docOrder.get(docName); |
| |
| docOfSentence[i] = docOrder_i; |
| |
| } |
| |
| inFile.close(); |
| |
| numDocuments = seenDocNames.size(); |
| |
| } else { // badly formatted |
| |
| } |
| |
| } catch (IOException e) { |
| throw new RuntimeException(e); |
| } |
| } |
| |
| } |
| |
| private boolean copyFile(String origFileName, String newFileName) { |
| try { |
| File inputFile = new File(origFileName); |
| File outputFile = new File(newFileName); |
| |
| InputStream in = new FileInputStream(inputFile); |
| OutputStream out = new FileOutputStream(outputFile); |
| |
| byte[] buffer = new byte[1024]; |
| int len; |
| while ((len = in.read(buffer)) > 0) { |
| out.write(buffer, 0, len); |
| } |
| in.close(); |
| out.close(); |
| |
| /* |
| * InputStream inStream = new FileInputStream(new File(origFileName)); BufferedReader inFile = |
| * new BufferedReader(new InputStreamReader(inStream, "utf8")); |
| * |
| * FileOutputStream outStream = new FileOutputStream(newFileName, false); OutputStreamWriter |
| * outStreamWriter = new OutputStreamWriter(outStream, "utf8"); BufferedWriter outFile = new |
| * BufferedWriter(outStreamWriter); |
| * |
| * String line; while(inFile.ready()) { line = inFile.readLine(); writeLine(line, outFile); } |
| * |
| * inFile.close(); outFile.close(); |
| */ |
| return true; |
| } catch (IOException e) { |
| LOG.error(e.getMessage(), e); |
| return false; |
| } |
| } |
| |
| private void renameFile(String origFileName, String newFileName) { |
| if (fileExists(origFileName)) { |
| deleteFile(newFileName); |
| File oldFile = new File(origFileName); |
| File newFile = new File(newFileName); |
| if (!oldFile.renameTo(newFile)) { |
| println("Warning: attempt to rename " + origFileName + " to " + newFileName |
| + " was unsuccessful!", 1); |
| } |
| } else { |
| println("Warning: file " + origFileName + " does not exist! (in MertCore.renameFile)", 1); |
| } |
| } |
| |
| private void deleteFile(String fileName) { |
| if (fileExists(fileName)) { |
| File fd = new File(fileName); |
| if (!fd.delete()) { |
| println("Warning: attempt to delete " + fileName + " was unsuccessful!", 1); |
| } |
| } |
| } |
| |
| private void writeLine(String line, BufferedWriter writer) throws IOException { |
| writer.write(line, 0, line.length()); |
| writer.newLine(); |
| writer.flush(); |
| } |
| |
| public void finish() { |
| if (myDecoder != null) { |
| myDecoder.cleanUp(); |
| } |
| |
| // create config file with final values |
| createConfigFile(lambda, decoderConfigFileName + ".ZMERT.final", decoderConfigFileName |
| + ".ZMERT.orig"); |
| |
| // delete current decoder config file and decoder output |
| deleteFile(decoderConfigFileName); |
| deleteFile(decoderOutFileName); |
| |
| // restore original name for config file (name was changed |
| // in initialize() so it doesn't get overwritten) |
| renameFile(decoderConfigFileName + ".ZMERT.orig", decoderConfigFileName); |
| |
| if (finalLambdaFileName != null) { |
| try { |
| PrintWriter outFile_lambdas = new PrintWriter(finalLambdaFileName); |
| for (int c = 1; c <= numParams; ++c) { |
| outFile_lambdas.println(paramNames[c] + " ||| " + lambda[c]); |
| } |
| outFile_lambdas.close(); |
| |
| } catch (IOException e) { |
| throw new RuntimeException(e); |
| } |
| } |
| |
| } |
| |
| private String[] cfgFileToArgsArray(String fileName) { |
| checkFile(fileName); |
| |
| Vector<String> argsVector = new Vector<>(); |
| |
| BufferedReader inFile = null; |
| try { |
| inFile = new BufferedReader(new FileReader(fileName)); |
| String line, origLine; |
| do { |
| line = inFile.readLine(); |
| origLine = line; // for error reporting purposes |
| |
| if (line != null && line.length() > 0 && line.charAt(0) != '#') { |
| |
| if (line.contains("#")) { // discard comment |
| line = line.substring(0, line.indexOf("#")); |
| } |
| |
| line = line.trim(); |
| |
| // now line should look like "-xxx XXX" |
| |
| String[] paramA = line.split("\\s+"); |
| |
| if (paramA.length == 2 && paramA[0].charAt(0) == '-') { |
| argsVector.add(paramA[0]); |
| argsVector.add(paramA[1]); |
| } else if (paramA.length > 2 |
| && (paramA[0].equals("-m") || paramA[0].equals("-docSet") || paramA[0] |
| .equals("-damianos"))) { |
| // -m (metricName), -docSet, and -damianos are allowed to have extra optinos |
| Collections.addAll(argsVector, paramA); |
| } else { |
| println("Malformed line in config file:"); |
| println(origLine); |
| System.exit(70); |
| } |
| |
| } |
| } while (line != null); |
| |
| inFile.close(); |
| } catch (FileNotFoundException e) { |
| throw new RuntimeException("Z-MERT configuration file " + fileName + " was not found!", e); |
| } catch (IOException e) { |
| throw new RuntimeException(e); |
| } |
| |
| String[] argsArray = new String[argsVector.size()]; |
| |
| for (int i = 0; i < argsVector.size(); ++i) { |
| argsArray[i] = argsVector.elementAt(i); |
| } |
| |
| return argsArray; |
| } |
| |
| private void processArgsArray(String[] args) { |
| processArgsArray(args, true); |
| } |
| |
| private void processArgsArray(String[] args, boolean firstTime) { |
| /* set default values */ |
| // Relevant files |
| dirPrefix = null; |
| sourceFileName = null; |
| refFileName = "reference.txt"; |
| refsPerSen = 1; |
| textNormMethod = 1; |
| paramsFileName = "params.txt"; |
| docInfoFileName = null; |
| finalLambdaFileName = null; |
| // MERT specs |
| metricName = "BLEU"; |
| metricName_display = metricName; |
| metricOptions = new String[2]; |
| metricOptions[0] = "4"; |
| metricOptions[1] = "closest"; |
| docSubsetInfo = new int[7]; |
| docSubsetInfo[0] = 0; |
| maxMERTIterations = 20; |
| prevMERTIterations = 20; |
| minMERTIterations = 5; |
| stopMinIts = 3; |
| stopSigValue = -1; |
| // |
| // /* possibly other early stopping criteria here */ |
| // |
| numOptThreads = 1; |
| saveInterFiles = 3; |
| compressFiles = 0; |
| initsPerIt = 20; |
| oneModificationPerIteration = false; |
| randInit = false; |
| seed = System.currentTimeMillis(); |
| // useDisk = 2; |
| // Decoder specs |
| decoderCommandFileName = null; |
| passIterationToDecoder = false; |
| decoderOutFileName = "output.nbest"; |
| validDecoderExitValue = 0; |
| decoderConfigFileName = "dec_cfg.txt"; |
| sizeOfNBest = 100; |
| fakeFileNameTemplate = null; |
| fakeFileNamePrefix = null; |
| fakeFileNameSuffix = null; |
| // Output specs |
| verbosity = 1; |
| decVerbosity = 1; |
| |
| damianos_method = 0; |
| damianos_param = 0.0; |
| damianos_mult = 0.0; |
| |
| int i = 0; |
| |
| while (i < args.length) { |
| String option = args[i]; |
| // Relevant files |
| switch (option) { |
| case "-dir": |
| dirPrefix = args[i + 1]; |
| break; |
| case "-s": |
| sourceFileName = args[i + 1]; |
| break; |
| case "-r": |
| refFileName = args[i + 1]; |
| break; |
| case "-rps": |
| refsPerSen = Integer.parseInt(args[i + 1]); |
| if (refsPerSen < 1) { |
| throw new RuntimeException("refsPerSen must be positive."); |
| } |
| break; |
| case "-txtNrm": |
| textNormMethod = Integer.parseInt(args[i + 1]); |
| if (textNormMethod < 0 || textNormMethod > 4) { |
| throw new RuntimeException("textNormMethod should be between 0 and 4"); |
| } |
| break; |
| case "-p": |
| paramsFileName = args[i + 1]; |
| break; |
| case "-docInfo": |
| docInfoFileName = args[i + 1]; |
| break; |
| case "-fin": |
| finalLambdaFileName = args[i + 1]; |
| // MERT specs |
| break; |
| case "-m": |
| metricName = args[i + 1]; |
| metricName_display = metricName; |
| if (EvaluationMetric.knownMetricName(metricName)) { |
| int optionCount = EvaluationMetric.metricOptionCount(metricName); |
| metricOptions = new String[optionCount]; |
| for (int opt = 0; opt < optionCount; ++opt) { |
| metricOptions[opt] = args[i + opt + 2]; |
| } |
| i += optionCount; |
| } else { |
| throw new RuntimeException("Unknown metric name " + metricName + "."); |
| } |
| break; |
| case "-docSet": |
| String method = args[i + 1]; |
| |
| if (method.equals("all")) { |
| docSubsetInfo[0] = 0; |
| i += 0; |
| } else if (method.equals("bottom")) { |
| String a = args[i + 2]; |
| if (a.endsWith("d")) { |
| docSubsetInfo[0] = 1; |
| a = a.substring(0, a.indexOf("d")); |
| } else { |
| docSubsetInfo[0] = 2; |
| a = a.substring(0, a.indexOf("%")); |
| } |
| docSubsetInfo[5] = Integer.parseInt(a); |
| i += 1; |
| } else if (method.equals("top")) { |
| String a = args[i + 2]; |
| if (a.endsWith("d")) { |
| docSubsetInfo[0] = 3; |
| a = a.substring(0, a.indexOf("d")); |
| } else { |
| docSubsetInfo[0] = 4; |
| a = a.substring(0, a.indexOf("%")); |
| } |
| docSubsetInfo[5] = Integer.parseInt(a); |
| i += 1; |
| } else if (method.equals("window")) { |
| String a1 = args[i + 2]; |
| a1 = a1.substring(0, a1.indexOf("d")); // size of window |
| String a2 = args[i + 4]; |
| if (a2.indexOf("p") > 0) { |
| docSubsetInfo[0] = 5; |
| a2 = a2.substring(0, a2.indexOf("p")); |
| } else { |
| docSubsetInfo[0] = 6; |
| a2 = a2.substring(0, a2.indexOf("r")); |
| } |
| docSubsetInfo[5] = Integer.parseInt(a1); |
| docSubsetInfo[6] = Integer.parseInt(a2); |
| i += 3; |
| } else { |
| throw new RuntimeException("Unknown docSet method " + method + "."); |
| } |
| break; |
| case "-maxIt": |
| maxMERTIterations = Integer.parseInt(args[i + 1]); |
| if (maxMERTIterations < 1) { |
| throw new RuntimeException("maxMERTIts must be positive."); |
| } |
| break; |
| case "-minIt": |
| minMERTIterations = Integer.parseInt(args[i + 1]); |
| if (minMERTIterations < 1) { |
| throw new RuntimeException("minMERTIts must be positive."); |
| } |
| break; |
| case "-prevIt": |
| prevMERTIterations = Integer.parseInt(args[i + 1]); |
| if (prevMERTIterations < 0) { |
| throw new RuntimeException("prevMERTIts must be non-negative."); |
| } |
| break; |
| case "-stopIt": |
| stopMinIts = Integer.parseInt(args[i + 1]); |
| if (stopMinIts < 1) { |
| throw new RuntimeException("stopMinIts must be positive."); |
| } |
| break; |
| case "-stopSig": |
| stopSigValue = Double.parseDouble(args[i + 1]); |
| break; |
| // |
| // /* possibly other early stopping criteria here */ |
| // |
| case "-thrCnt": |
| numOptThreads = Integer.parseInt(args[i + 1]); |
| if (numOptThreads < 1) { |
| throw new RuntimeException("threadCount must be positive."); |
| } |
| break; |
| case "-save": |
| saveInterFiles = Integer.parseInt(args[i + 1]); |
| if (saveInterFiles < 0 || saveInterFiles > 3) { |
| throw new RuntimeException("save should be between 0 and 3"); |
| } |
| break; |
| case "-compress": |
| compressFiles = Integer.parseInt(args[i + 1]); |
| if (compressFiles < 0 || compressFiles > 1) { |
| throw new RuntimeException("compressFiles should be either 0 or 1"); |
| } |
| break; |
| case "-ipi": |
| initsPerIt = Integer.parseInt(args[i + 1]); |
| if (initsPerIt < 1) { |
| throw new RuntimeException("initsPerIt must be positive."); |
| } |
| break; |
| case "-opi": |
| int opi = Integer.parseInt(args[i + 1]); |
| if (opi == 1) { |
| oneModificationPerIteration = true; |
| } else if (opi == 0) { |
| oneModificationPerIteration = false; |
| } else { |
| throw new RuntimeException("oncePerIt must be either 0 or 1."); |
| } |
| break; |
| case "-rand": |
| int rand = Integer.parseInt(args[i + 1]); |
| if (rand == 1) { |
| randInit = true; |
| } else if (rand == 0) { |
| randInit = false; |
| } else { |
| throw new RuntimeException("randInit must be either 0 or 1."); |
| } |
| break; |
| case "-seed": |
| if (args[i + 1].equals("time")) { |
| seed = System.currentTimeMillis(); |
| } else { |
| seed = Long.parseLong(args[i + 1]); |
| } |
| break; |
| /* |
| * else if (option.equals("-ud")) { useDisk = Integer.parseInt(args[i+1]); if (useDisk < 0 || |
| * useDisk > 2) { println("useDisk should be between 0 and 2"); System.exit(10); } } |
| */ |
| // Decoder specs |
| case "-cmd": |
| decoderCommandFileName = args[i + 1]; |
| break; |
| case "-passIt": |
| int val = Integer.parseInt(args[i + 1]); |
| if (val < 0 || val > 1) { |
| throw new RuntimeException("passIterationToDecoder should be either 0 or 1"); |
| } |
| passIterationToDecoder = (val == 1); |
| break; |
| case "-decOut": |
| decoderOutFileName = args[i + 1]; |
| break; |
| case "-decExit": |
| validDecoderExitValue = Integer.parseInt(args[i + 1]); |
| break; |
| case "-dcfg": |
| decoderConfigFileName = args[i + 1]; |
| break; |
| case "-N": |
| sizeOfNBest = Integer.parseInt(args[i + 1]); |
| if (sizeOfNBest < 1) { |
| throw new RuntimeException("N must be positive."); |
| } |
| break; |
| // Output specs |
| case "-v": |
| verbosity = Integer.parseInt(args[i + 1]); |
| if (verbosity < 0 || verbosity > 4) { |
| throw new RuntimeException("verbosity should be between 0 and 4"); |
| } |
| break; |
| case "-decV": |
| decVerbosity = Integer.parseInt(args[i + 1]); |
| if (decVerbosity < 0 || decVerbosity > 1) { |
| throw new RuntimeException("decVerbosity should be either 0 or 1"); |
| } |
| break; |
| case "-fake": |
| fakeFileNameTemplate = args[i + 1]; |
| int QM_i = fakeFileNameTemplate.indexOf("?"); |
| if (QM_i <= 0) { |
| throw new RuntimeException( |
| "fakeFileNameTemplate must contain '?' to indicate position of iteration number"); |
| } |
| fakeFileNamePrefix = fakeFileNameTemplate.substring(0, QM_i); |
| fakeFileNameSuffix = fakeFileNameTemplate.substring(QM_i + 1); |
| break; |
| case "-damianos": |
| damianos_method = Integer.parseInt(args[i + 1]); |
| if (damianos_method < 0 || damianos_method > 3) { |
| throw new RuntimeException("damianos_method should be between 0 and 3"); |
| } |
| damianos_param = Double.parseDouble(args[i + 2]); |
| damianos_mult = Double.parseDouble(args[i + 3]); |
| i += 2; |
| break; |
| default: |
| throw new RuntimeException("Unknown option " + option); |
| } |
| |
| i += 2; |
| |
| } // while (i) |
| |
| if (maxMERTIterations < minMERTIterations) { |
| |
| if (firstTime) |
| println("Warning: maxMERTIts is smaller than minMERTIts; " + "decreasing minMERTIts from " |
| + minMERTIterations + " to maxMERTIts " + "(i.e. " + maxMERTIterations + ").", 1); |
| |
| minMERTIterations = maxMERTIterations; |
| } |
| |
| if (dirPrefix != null) { // append dirPrefix to file names |
| refFileName = fullPath(dirPrefix, refFileName); |
| decoderOutFileName = fullPath(dirPrefix, decoderOutFileName); |
| paramsFileName = fullPath(dirPrefix, paramsFileName); |
| decoderConfigFileName = fullPath(dirPrefix, decoderConfigFileName); |
| |
| if (sourceFileName != null) { |
| sourceFileName = fullPath(dirPrefix, sourceFileName); |
| } |
| if (docInfoFileName != null) { |
| docInfoFileName = fullPath(dirPrefix, docInfoFileName); |
| } |
| if (finalLambdaFileName != null) { |
| finalLambdaFileName = fullPath(dirPrefix, finalLambdaFileName); |
| } |
| if (decoderCommandFileName != null) { |
| decoderCommandFileName = fullPath(dirPrefix, decoderCommandFileName); |
| } |
| if (fakeFileNamePrefix != null) { |
| fakeFileNamePrefix = fullPath(dirPrefix, fakeFileNamePrefix); |
| } |
| } |
| |
| // TODO: make this an argument |
| // TODO: also use this for the state file? could be tricky, since that file is created by |
| // ZMERT.java |
| // TODO: change name from tmpDirPrefix to tmpFilePrefix? |
| int k = decoderOutFileName.lastIndexOf("/"); |
| if (k >= 0) { |
| tmpDirPrefix = decoderOutFileName.substring(0, k + 1) + "ZMERT."; |
| } else { |
| tmpDirPrefix = "ZMERT."; |
| } |
| println("tmpDirPrefix: " + tmpDirPrefix); |
| |
| checkFile(paramsFileName); |
| checkFile(decoderConfigFileName); |
| |
| boolean canRunCommand = fileExists(decoderCommandFileName); |
| if (decoderCommandFileName != null && !canRunCommand) { |
| // i.e. a decoder command file was specified, but it was not found |
| if (firstTime) |
| println("Warning: specified decoder command file " + decoderCommandFileName |
| + " was not found.", 1); |
| } |
| boolean canRunJoshua = fileExists(sourceFileName); |
| if (sourceFileName != null && !canRunJoshua) { |
| // i.e. a source file was specified, but it was not found |
| if (firstTime) |
| println("Warning: specified source file " + sourceFileName + " was not found.", 1); |
| } |
| boolean canRunFake = (fakeFileNameTemplate != null); |
| |
| if (!canRunCommand && !canRunJoshua) { // can only run fake decoder |
| |
| if (!canRunFake) { |
| String msg = "Z-MERT cannot decode; must provide one of: command file " |
| + "(for external decoder) source file (for Joshua decoder)," |
| + " or prefix for existing output files (for fake decoder)."; |
| throw new RuntimeException(msg); |
| } |
| |
| int lastGoodIt = 0; |
| for (int it = 1; it <= maxMERTIterations; ++it) { |
| if (fileExists(fakeFileNamePrefix + it + fakeFileNameSuffix)) { |
| lastGoodIt = it; |
| } else { |
| break; // from for (it) loop |
| } |
| } |
| |
| if (lastGoodIt == 0) { |
| throw new RuntimeException("Fake decoder cannot find first output file " |
| + (fakeFileNamePrefix + 1 + fakeFileNameSuffix)); |
| } else if (lastGoodIt < maxMERTIterations) { |
| if (firstTime) |
| println("Warning: can only run fake decoder; existing output files " |
| + "are only available for the first " + lastGoodIt + " iteration(s).", 1); |
| } |
| |
| } |
| |
| if (firstTime) { |
| println("Processed the following args array:", 1); |
| print(" ", 1); |
| for (i = 0; i < args.length; ++i) { |
| print(args[i] + " ", 1); |
| } |
| println("", 1); |
| println("", 1); |
| } |
| |
| } // processArgs(String[] args) |
| |
| private void set_docSubsetInfo(int[] info) { |
| |
| /* |
| * 1: -docSet bottom 8d 2: -docSet bottom 25% the bottom ceil(0.20*numDocs) documents 3: -docSet |
| * top 8d 4: -docSet top 25% the top ceil(0.20*numDocs) documents |
| * |
| * 5: -docSet window 11d around 90percentile 11 docs centered around 80th percentile (complain |
| * if not enough docs; don't adjust) 6: -docSet window 11d around 40rank 11 docs centered around |
| * doc ranked 50 (complain if not enough docs; don't adjust) |
| * |
| * |
| * [0]: method (0-6) [1]: first (1-indexed) [2]: last (1-indexed) [3]: size [4]: center [5]: |
| * arg1 (-1 for method 0) [6]: arg2 (-1 for methods 0-4) |
| */ |
| if (info[0] == 0) { // all |
| info[1] = 1; |
| info[2] = numDocuments; |
| info[3] = numDocuments; |
| info[4] = (info[1] + info[2]) / 2; |
| } |
| if (info[0] == 1) { // bottom d |
| info[3] = info[5]; |
| info[2] = numDocuments; |
| info[1] = numDocuments - info[3] + 1; |
| info[4] = (info[1] + info[2]) / 2; |
| } |
| if (info[0] == 2) { // bottom p |
| info[3] = (int) (Math.ceil((info[5] / 100.0) * numDocuments)); |
| info[2] = numDocuments; |
| info[1] = numDocuments - info[3] + 1; |
| info[4] = (info[1] + info[2]) / 2; |
| } |
| if (info[0] == 3) { // top d |
| info[3] = info[5]; |
| info[1] = 1; |
| info[2] = info[3]; |
| info[4] = (info[1] + info[2]) / 2; |
| } |
| if (info[0] == 4) { // top p |
| info[3] = (int) (Math.ceil((info[5] / 100.0) * numDocuments)); |
| info[1] = 1; |
| info[2] = info[3]; |
| info[4] = (info[1] + info[2]) / 2; |
| } |
| if (info[0] == 5) { // window around percentile |
| info[3] = info[5]; |
| info[4] = (int) (Math.floor((info[6] / 100.0) * numDocuments)); |
| info[1] = info[4] - ((info[3] - 1) / 2); |
| info[2] = info[4] + ((info[3] - 1) / 2); |
| } |
| if (info[0] == 6) { // window around rank |
| info[3] = info[5]; |
| info[4] = info[6]; |
| info[1] = info[4] - ((info[3] - 1) / 2); |
| info[2] = info[4] + ((info[3] - 1) / 2); |
| } |
| } |
| |
| private void checkFile(String fileName) { |
| if (!fileExists(fileName)) { |
| throw new RuntimeException("The file " + fileName + " was not found!"); |
| } |
| } |
| |
| private boolean fileExists(String fileName) { |
| if (fileName == null) return false; |
| File checker = new File(fileName); |
| return checker.exists(); |
| } |
| |
| private void gzipFile(String inputFileName) { |
| gzipFile(inputFileName, inputFileName + ".gz"); |
| } |
| |
| private void gzipFile(String inputFileName, String gzippedFileName) { |
| // NOTE: this will delete the original file |
| |
| try { |
| FileInputStream in = new FileInputStream(inputFileName); |
| GZIPOutputStream out = new GZIPOutputStream(new FileOutputStream(gzippedFileName)); |
| |
| byte[] buffer = new byte[4096]; |
| int len; |
| while ((len = in.read(buffer)) > 0) { |
| out.write(buffer, 0, len); |
| } |
| |
| in.close(); |
| out.finish(); |
| out.close(); |
| |
| deleteFile(inputFileName); |
| |
| } catch (IOException e) { |
| throw new RuntimeException(e); |
| } |
| } |
| |
| private String normalize(String str, int normMethod) { |
| if (normMethod == 0) return str; |
| |
| // replace HTML/SGML |
| str = str.replace(""", "\""); |
| str = str.replace("&", "&"); |
| str = str.replace("<", "<"); |
| str = str.replace(">", ">"); |
| str = str.replace("'", "'"); |
| |
| |
| |
| // split on these characters: |
| // ! " # $ % & ( ) * + / : ; < = > ? @ [ \ ] ^ _ ` { | } ~ |
| // i.e. ASCII 33-126, except alphanumeric, and except "," "-" "." "'" |
| |
| // ! "# $%& ( ) * +/:;<=> ?@ [ \ ] ^_` { | }~ |
| String split_on = "!\"#\\$%&\\(\\)\\*\\+/:;<=>\\?@\\[\\\\\\]\\^_`\\{\\|\\}~"; |
| |
| // println("split_on: " + split_on); |
| |
| for (int k = 0; k < split_on.length(); ++k) { |
| // for each split character, reprocess the string |
| String regex = "" + split_on.charAt(k); |
| if (regex.equals("\\")) { |
| ++k; |
| regex += split_on.charAt(k); |
| } |
| str = str.replaceAll(regex, " " + regex + " "); |
| } |
| |
| |
| |
| // split on "." and "," and "-", conditioned on proper context |
| |
| str = " " + str + " "; |
| str = str.replaceAll("\\s+", " "); |
| |
| TreeSet<Integer> splitIndices = new TreeSet<>(); |
| |
| for (int i = 0; i < str.length(); ++i) { |
| char ch = str.charAt(i); |
| if (ch == '.' || ch == ',') { |
| // split if either of the previous or next characters is a non-digit |
| char prev_ch = str.charAt(i - 1); |
| char next_ch = str.charAt(i + 1); |
| if (prev_ch < '0' || prev_ch > '9' || next_ch < '0' || next_ch > '9') { |
| splitIndices.add(i); |
| } |
| } else if (ch == '-') { |
| // split if preceded by a digit |
| char prev_ch = str.charAt(i - 1); |
| if (prev_ch >= '0' && prev_ch <= '9') { |
| splitIndices.add(i); |
| } |
| } |
| } |
| |
| String str0 = str; |
| str = ""; |
| |
| for (int i = 0; i < str0.length(); ++i) { |
| if (splitIndices.contains(i)) { |
| str += " " + str0.charAt(i) + " "; |
| } else { |
| str += str0.charAt(i); |
| } |
| } |
| |
| |
| |
| // rejoin i'm, we're, *'s, won't, don't, etc |
| |
| str = " " + str + " "; |
| str = str.replaceAll("\\s+", " "); |
| |
| str = str.replaceAll(" i 'm ", " i'm "); |
| str = str.replaceAll(" we 're ", " we're "); |
| str = str.replaceAll(" 's ", "'s "); |
| str = str.replaceAll(" 've ", "'ve "); |
| str = str.replaceAll(" 'll ", "'ll "); |
| str = str.replaceAll(" 'd ", "'d "); |
| str = str.replaceAll(" n't ", "n't "); |
| |
| |
| |
| // remove spaces around dashes |
| if (normMethod == 2 || normMethod == 4) { |
| |
| TreeSet<Integer> skipIndices = new TreeSet<>(); |
| str = " " + str + " "; |
| |
| for (int i = 0; i < str.length(); ++i) { |
| char ch = str.charAt(i); |
| if (ch == '-') { |
| // rejoin if surrounded by spaces, and then letters |
| if (str.charAt(i - 1) == ' ' && str.charAt(i + 1) == ' ') { |
| if (Character.isLetter(str.charAt(i - 2)) && Character.isLetter(str.charAt(i + 2))) { |
| skipIndices.add(i - 1); |
| skipIndices.add(i + 1); |
| } |
| } |
| } |
| } |
| |
| str0 = str; |
| str = ""; |
| |
| for (int i = 0; i < str0.length(); ++i) { |
| if (!skipIndices.contains(i)) { |
| str += str0.charAt(i); |
| } |
| } |
| } |
| |
| |
| |
| // drop non-ASCII characters |
| if (normMethod == 3 || normMethod == 4) { |
| |
| str0 = str; |
| str = ""; |
| |
| for (int i = 0; i < str0.length(); ++i) { |
| char ch = str0.charAt(i); |
| if (ch <= 127) { // i.e. if ASCII |
| str += ch; |
| } |
| } |
| } |
| |
| |
| |
| str = str.replaceAll("\\s+", " "); |
| |
| str = str.trim(); |
| |
| return str; |
| } |
| |
| private String fullPath(String dir, String fileName) { |
| File dummyFile = new File(dir, fileName); |
| return dummyFile.getAbsolutePath(); |
| } |
| |
| private void println(Object obj, int priority) { |
| if (priority <= verbosity) println(obj); |
| } |
| |
| private void print(Object obj, int priority) { |
| if (priority <= verbosity) print(obj); |
| } |
| |
| private void println(Object obj) { |
| System.out.println(obj); |
| } |
| |
| private void print(Object obj) { |
| System.out.print(obj); |
| } |
| |
| private double[] randomLambda() { |
| double[] retLambda = new double[1 + numParams]; |
| |
| for (int c = 1; c <= numParams; ++c) { |
| if (isOptimizable[c]) { |
| double randVal = randGen.nextDouble(); // number in [0.0,1.0] |
| ++generatedRands; |
| randVal = randVal * (maxRandValue[c] - minRandValue[c]); // number in [0.0,max-min] |
| randVal = minRandValue[c] + randVal; // number in [min,max] |
| retLambda[c] = randVal; |
| } else { |
| retLambda[c] = defaultLambda[c]; |
| } |
| } |
| |
| return retLambda; |
| } |
| |
| private double[] randomPerturbation(double[] origLambda, int i, double method, double param, |
| double mult) { |
| double sigma = 0.0; |
| if (method == 1) { |
| sigma = 1.0 / Math.pow(i, param); |
| } else if (method == 2) { |
| sigma = Math.exp(-param * i); |
| } else if (method == 3) { |
| sigma = Math.max(0.0, 1.0 - (i / param)); |
| } |
| |
| sigma = mult * sigma; |
| |
| double[] retLambda = new double[1 + numParams]; |
| |
| for (int c = 1; c <= numParams; ++c) { |
| if (isOptimizable[c]) { |
| double randVal = 2 * randGen.nextDouble() - 1.0; // number in [-1.0,1.0] |
| ++generatedRands; |
| randVal = randVal * sigma; // number in [-sigma,sigma] |
| randVal = randVal * origLambda[c]; // number in [-sigma*orig[c],sigma*orig[c]] |
| randVal = randVal + origLambda[c]; // number in |
| // [orig[c]-sigma*orig[c],orig[c]+sigma*orig[c]] |
| // = [orig[c]*(1-sigma),orig[c]*(1+sigma)] |
| retLambda[c] = randVal; |
| } else { |
| retLambda[c] = origLambda[c]; |
| } |
| } |
| |
| return retLambda; |
| } |
| |
| private int c_fromParamName(String pName) { |
| for (int c = 1; c <= numParams; ++c) { |
| if (paramNames[c].equals(pName)) return c; |
| } |
| return 0; // no parameter with that name! |
| } |
| |
| private void setFeats(double[][][] featVal_array, int i, int[] lastUsedIndex, int[] maxIndex, |
| double[] featVal) { |
| int k = lastUsedIndex[i] + 1; |
| |
| if (k > maxIndex[i]) { |
| for (int c = 1; c <= numParams; ++c) { |
| double[] temp = featVal_array[c][i]; |
| featVal_array[c][i] = new double[1 + maxIndex[i] + sizeOfNBest]; |
| |
| System.arraycopy(temp, 0, featVal_array[c][i], 0, maxIndex[i] + 1); |
| } |
| maxIndex[i] += sizeOfNBest; |
| // cleanupMemorySilently(); // UNCOMMENT THIS if cleaning up memory |
| } |
| |
| for (int c = 1; c <= numParams; ++c) { |
| featVal_array[c][i][k] = featVal[c]; |
| } |
| lastUsedIndex[i] += 1; |
| } |
| |
| public static void main(String[] args) throws FileNotFoundException, IOException { |
| |
| String configFileName = args[0]; |
| String stateFileName = args[1]; |
| int currIteration = Integer.parseInt(args[2]); |
| JoshuaConfiguration joshuaConfiguration = new JoshuaConfiguration(); |
| |
| MertCore DMC = new MertCore(joshuaConfiguration); // dummy MertCore object |
| |
| // if bad args[], System.exit(80) |
| |
| |
| |
| |
| int randsToSkip = 0; |
| int earlyStop = 0; |
| double FINAL_score = 0.0; |
| int[] maxIndex = null; |
| |
| if (currIteration == 1) { |
| EvaluationMetric.set_knownMetrics(); |
| DMC.processArgsArray(DMC.cfgFileToArgsArray(configFileName), true); |
| |
| randsToSkip = 0; |
| DMC.initialize(randsToSkip); |
| |
| DMC.println("----------------------------------------------------", 1); |
| DMC.println("Z-MERT run started @ " + (new Date()), 1); |
| // DMC.printMemoryUsage(); |
| DMC.println("----------------------------------------------------", 1); |
| DMC.println("", 1); |
| |
| if (DMC.randInit) { |
| DMC.println("Initializing lambda[] randomly.", 1); |
| |
| // initialize optimizable parameters randomly (sampling uniformly from |
| // that parameter's random value range) |
| DMC.lambda = DMC.randomLambda(); |
| } |
| |
| DMC.println("Initial lambda[]: " + DMC.lambdaToString(DMC.lambda), 1); |
| DMC.println("", 1); |
| |
| FINAL_score = DMC.evalMetric.worstPossibleScore(); |
| maxIndex = new int[DMC.numSentences]; |
| for (int i = 0; i < DMC.numSentences; ++i) { |
| maxIndex[i] = DMC.sizeOfNBest - 1; |
| } |
| earlyStop = 0; |
| } else { |
| |
| EvaluationMetric.set_knownMetrics(); |
| DMC.processArgsArray(DMC.cfgFileToArgsArray(configFileName), false); |
| |
| double[] serA = null; |
| try { |
| ObjectInputStream in = new ObjectInputStream(new FileInputStream(stateFileName)); |
| serA = (double[]) in.readObject(); |
| in.close(); |
| // contents of serA[]: |
| // (*) last iteration |
| // (*) number of random numbers generated already |
| // (*) earlyStop |
| // (*) FINAL_score |
| // (*) lambda[] |
| // (*) maxIndex[] |
| // => length should be 4+numParams+numSentences |
| } catch (FileNotFoundException e) { |
| System.err.println("FileNotFoundException in MertCore.main(String[]): " + e.getMessage()); |
| System.exit(99901); |
| } catch (IOException e) { |
| System.err.println("IOException in MertCore.main(String[]): " + e.getMessage()); |
| System.exit(99902); |
| } catch (ClassNotFoundException e) { |
| System.err.println("ClassNotFoundException in MertCore.main(String[]): " + e.getMessage()); |
| System.exit(99904); |
| } |
| |
| if (serA.length < 2) { |
| DMC.println("State file contains an array of length " + serA.length + "; " |
| + "was expecting at least 2"); |
| System.exit(81); |
| } |
| |
| if ((int) serA[0] != currIteration - 1) { |
| DMC.println("Iteration in state file is " + (int) serA[0] + "; " + "was expecting " |
| + (currIteration - 1)); |
| System.exit(82); |
| } |
| |
| randsToSkip = (int) serA[1]; |
| DMC.initialize(randsToSkip); // declares lambda[], sets numParams and numSentences |
| |
| if (serA.length != 4 + DMC.numParams + DMC.numSentences) { |
| DMC.println("State file contains an array of length " + serA.length + "; " |
| + "was expecting " + (4 + DMC.numParams + DMC.numSentences)); |
| System.exit(83); |
| } |
| |
| earlyStop = (int) serA[2]; |
| FINAL_score = serA[3]; |
| |
| System.arraycopy(serA, 4, DMC.lambda, 1, DMC.numParams); |
| |
| maxIndex = new int[DMC.numSentences]; |
| for (int i = 0; i < DMC.numSentences; ++i) { |
| maxIndex[i] = (int) serA[3 + DMC.numParams + 1 + i]; |
| } |
| } |
| |
| |
| double[] A = |
| DMC.run_single_iteration(currIteration, DMC.minMERTIterations, DMC.maxMERTIterations, |
| DMC.prevMERTIterations, earlyStop, maxIndex); |
| |
| if (A != null) { |
| FINAL_score = A[0]; |
| earlyStop = (int) A[1]; |
| randsToSkip = DMC.generatedRands; |
| } |
| |
| |
| if (A != null && A[2] != 1) { |
| |
| double[] serA = new double[4 + DMC.numParams + DMC.numSentences]; |
| serA[0] = currIteration; |
| serA[1] = randsToSkip; |
| serA[2] = earlyStop; |
| serA[3] = FINAL_score; |
| System.arraycopy(DMC.lambda, 1, serA, 4, DMC.numParams); |
| for (int i = 0; i < DMC.numSentences; ++i) { |
| serA[3 + DMC.numParams + 1 + i] = maxIndex[i]; |
| } |
| |
| try { |
| ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(stateFileName)); |
| out.writeObject(serA); |
| out.flush(); |
| out.close(); |
| } catch (FileNotFoundException e) { |
| System.err.println("FileNotFoundException in MertCore.main(String[]): " + e.getMessage()); |
| System.exit(99901); |
| } catch (IOException e) { |
| System.err.println("IOException in MertCore.main(String[]): " + e.getMessage()); |
| System.exit(99902); |
| } |
| |
| System.exit(91); |
| |
| } else { |
| // done |
| |
| DMC.println("", 1); |
| |
| DMC.println("----------------------------------------------------", 1); |
| DMC.println("Z-MERT run ended @ " + (new Date()), 1); |
| // DMC.printMemoryUsage(); |
| DMC.println("----------------------------------------------------", 1); |
| DMC.println("", 1); |
| DMC.println("FINAL lambda: " + DMC.lambdaToString(DMC.lambda) + " (" + DMC.metricName_display |
| + ": " + FINAL_score + ")", 1); |
| // check if a lambda is outside its threshold range |
| for (int c = 1; c <= DMC.numParams; ++c) { |
| if (DMC.lambda[c] < DMC.minThValue[c] || DMC.lambda[c] > DMC.maxThValue[c]) { |
| DMC.println("Warning: after normalization, lambda[" + c + "]=" + f4.format(DMC.lambda[c]) |
| + " is outside its critical value range.", 1); |
| } |
| } |
| DMC.println("", 1); |
| |
| // delete intermediate .temp.*.it* decoder output files |
| for (int iteration = 1; iteration <= DMC.maxMERTIterations; ++iteration) { |
| if (DMC.compressFiles == 1) { |
| DMC.deleteFile(DMC.tmpDirPrefix + "temp.sents.it" + iteration + ".gz"); |
| DMC.deleteFile(DMC.tmpDirPrefix + "temp.feats.it" + iteration + ".gz"); |
| if (DMC.fileExists(DMC.tmpDirPrefix + "temp.stats.it" + iteration + ".copy.gz")) { |
| DMC.deleteFile(DMC.tmpDirPrefix + "temp.stats.it" + iteration + ".copy.gz"); |
| } else { |
| DMC.deleteFile(DMC.tmpDirPrefix + "temp.stats.it" + iteration + ".gz"); |
| } |
| } else { |
| DMC.deleteFile(DMC.tmpDirPrefix + "temp.sents.it" + iteration); |
| DMC.deleteFile(DMC.tmpDirPrefix + "temp.feats.it" + iteration); |
| if (DMC.fileExists(DMC.tmpDirPrefix + "temp.stats.it" + iteration + ".copy")) { |
| DMC.deleteFile(DMC.tmpDirPrefix + "temp.stats.it" + iteration + ".copy"); |
| } else { |
| DMC.deleteFile(DMC.tmpDirPrefix + "temp.stats.it" + iteration); |
| } |
| } |
| } |
| |
| |
| DMC.finish(); |
| |
| DMC.deleteFile(stateFileName); |
| System.exit(90); |
| } |
| |
| } |
| |
| } |
| |
| |
| /* |
| * |
| * fake: ----- ex2_N300: java -javaagent:shiftone-jrat.jar -Xmx300m -cp bin joshua.ZMERT.ZMERT -dir |
| * MERT_example -s src.txt -r ref.all -rps 4 -cmd decoder_command_ex2.txt -dcfg config_ex2.txt |
| * -decOut nbest_ex2.out -N 300 -p params.txt -maxIt 25 -opi 0 -ipi 20 -v 2 -rand 0 -seed |
| * 1226091488390 -save 1 -fake nbest_ex2.out.N300.it > |
| * ex2_N300ipi20opi0_300max+defratios.it10.noMemRep.bugFixes.monitored.txt |
| * |
| * ex2_N500: java -javaagent:shiftone-jrat.jar -Xmx300m -cp bin joshua.ZMERT.ZMERT -dir MERT_example |
| * -s src.txt -r ref.all -rps 4 -cmd decoder_command_ex2.txt -dcfg config_ex2.txt -decOut |
| * nbest_ex2.out -N 500 -p params.txt -maxIt 25 -opi 0 -ipi 20 -v 2 -rand 0 -seed 1226091488390 |
| * -save 1 -fake nbest_ex2.out.N500.it > |
| * ex2_N500ipi20opi0_300max+defratios.it05.noMemRep.bugFixes.monitored.txt |
| * |
| * exL_N300__600max: java -javaagent:shiftone-jrat.jar -Xmx600m -cp bin joshua.ZMERT.ZMERT -dir |
| * MERT_example -s mt06_source.txt -r mt06_ref.all -rps 4 -cmd decoder_command_ex2.txt -dcfg |
| * config_ex2.txt -decOut nbest_exL.out -N 300 -p params.txt -maxIt 5 -opi 0 -ipi 20 -v 2 -rand 0 |
| * -seed 1226091488390 -save 1 -fake nbest_exL.out.it > |
| * exL_N300ipi20opi0_600max+defratios.it05.noMemRep.bugFixes.monitored.txt |
| * |
| * exL_N300__300max: java -javaagent:shiftone-jrat.jar -Xmx300m -cp bin joshua.ZMERT.ZMERT -dir |
| * MERT_example -s mt06_source.txt -r mt06_ref.all -rps 4 -cmd decoder_command_ex2.txt -dcfg |
| * config_ex2.txt -decOut nbest_exL.out -N 300 -p params.txt -maxIt 5 -opi 0 -ipi 20 -v 2 -rand 0 |
| * -seed 1226091488390 -save 1 -fake nbest_exL.out.it > |
| * exL_N300ipi20opi0_300max+defratios.it05.noMemRep.bugFixes.monitored.txt |
| * |
| * gen: ---- ex2_N300: make sure top_n=300 in MERT_example\config_ex2.txt java |
| * -javaagent:shiftone-jrat.jar -Xmx300m -cp bin joshua.ZMERT.ZMERT -dir MERT_example -s src.txt -r |
| * ref.all -rps 4 -cmd decoder_command_ex2.txt -dcfg config_ex2.txt -decOut nbest_ex2.out -N 300 -p |
| * params.txt -maxIt 25 -opi 0 -ipi 20 -v 2 -rand 0 -seed 1226091488390 -save 1 > |
| * ex2_N300ipi20opi0_300max+defratios.itxx.monitored.txt.gen |
| * |
| * ex2_N500: make sure top_n=500 in MERT_example\config_ex2.txt java -javaagent:shiftone-jrat.jar |
| * -Xmx300m -cp bin joshua.ZMERT.ZMERT -dir MERT_example -s src.txt -r ref.all -rps 4 -cmd |
| * decoder_command_ex2.txt -dcfg config_ex2.txt -decOut nbest_ex2.out -N 500 -p params.txt -maxIt 25 |
| * -opi 0 -ipi 20 -v 2 -rand 0 -seed 1226091488390 -save 1 > |
| * ex2_N500ipi20opi0_300max+defratios.itxx.monitored.txt.gen |
| * |
| * exL_N300__600max: run on CLSP machines only! (e.g. z12) $JAVA_bin/java |
| * -javaagent:shiftone-jrat.jar -Xmx600m -cp bin joshua.ZMERT.ZMERT -dir YOURDIR -s mt06_source.txt |
| * -r mt06_ref.all -rps 4 -cmd decoder_command.txt -dcfg config_exL.txt -decOut nbest_exL.out -N 300 |
| * -p params.txt -maxIt 25 -opi 0 -ipi 20 -v 2 -rand 0 -seed 1226091488390 -save 1 > |
| * exL_N300ipi20opi0_600max+defratios.itxx.monitored.txt.gen |
| * |
| * exL_N300__300max: run on CLSP machines only! (e.g. z12) $JAVA_bin/java |
| * -javaagent:shiftone-jrat.jar -Xmx300m -cp bin joshua.ZMERT.ZMERT -dir YOURDIR -s mt06_source.txt |
| * -r mt06_ref.all -rps 4 -cmd decoder_command.txt -dcfg config_exL.txt -decOut nbest_exL.out -N 300 |
| * -p params.txt -maxIt 25 -opi 0 -ipi 20 -v 2 -rand 0 -seed 1226091488390 -save 1 > |
| * exL_N300ipi20opi0_300max+defratios.itxx.monitored.txt.gen |
| */ |