blob: e54853368f1a6ae7590e774ccdaef0868d9b969c [file] [log] [blame]
package joshua.mira;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Random;
import java.util.Scanner;
import java.util.TreeSet;
import java.util.Vector;
import java.util.concurrent.ConcurrentHashMap;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import joshua.decoder.Decoder;
import joshua.decoder.JoshuaConfiguration;
import joshua.metrics.EvaluationMetric;
import joshua.util.StreamGobbler;
import joshua.corpus.Vocabulary;
/**
* This code was originally written by Yuan Cao, who copied the MERT code to produce this file.
*/
public class MIRACore {
// Decoder configuration supplied to the constructors; handed to the Decoder in initialize().
private final JoshuaConfiguration joshuaConfiguration;
// One TreeSet per dev-set sentence (allocated in initialize()).
// NOTE(review): presumably the candidate indices of interest for each sentence -- confirm
// against the code that fills it.
private TreeSet<Integer>[] indicesOfInterest_all;
// Formatter with four decimal places; used when printing weight values.
private final static DecimalFormat f4 = new DecimalFormat("###0.0000");
// Handle to the JVM runtime.
private final Runtime myRuntime = Runtime.getRuntime();
// Negative and positive infinity, obtained through floating-point division by zero.
private final static double NegInf = (-1.0 / 0.0);
private final static double PosInf = (+1.0 / 0.0);
// Small constant (1e-6).
private final static double epsilon = 1.0 / 1000000;
// NOTE(review): usage is not visible in this part of the file.
private int progress;
private int verbosity; // anything of priority <= verbosity will be printed
// (lower value for priority means more important)
// Random number generator; (re)created from `seed` in initialize().
private Random randGen;
// Set in initialize() to the number of random draws that were skipped.
// NOTE(review): presumably the count of random numbers drawn so far -- confirm.
private int generatedRands;
private int numSentences;
// number of sentences in the dev set
// (aka the "MERT training" set)
private int numDocuments;
// number of documents in the dev set
// this should be 1, unless doing doc-level optimization
private int[] docOfSentence;
// docOfSentence[i] stores which document contains the i'th sentence.
// docOfSentence is 0-indexed, as are the documents (i.e. first doc is indexed 0)
private int[] docSubsetInfo;
// stores information regarding which subset of the documents are evaluated
// [0]: method (0-6)
// [1]: first (1-indexed)
// [2]: last (1-indexed)
// [3]: size
// [4]: center
// [5]: arg1
// [6]: arg2
// [1-6] are 0 for method 0, [6] is 0 for methods 1-4 as well
// only [1] and [2] are needed for optimization. The rest are only needed for an output message.
private int refsPerSen;
// number of reference translations per sentence
private int textNormMethod;
// 0: no normalization,
// 1: "NIST-style" tokenization, and also rejoin 'm, 're, *'s, 've, 'll, 'd, and n't,
// 2: apply 1 and also rejoin dashes between letters,
// 3: apply 1 and also drop non-ASCII characters,
// 4: apply 1+2+3
private int numParams;
// total number of firing features
// this number may increase over time as new n-best lists are decoded
// initially it is equal to the number of params in the parameter config file
private int numParamsOld;
// number of features before observing the new features fired in the current iteration
private double[] normalizationOptions;
// How should a lambda[] vector be normalized (before decoding)?
// nO[0] = 0: no normalization
// nO[0] = 1: scale so that parameter nO[2] has absolute value nO[1]
// nO[0] = 2: scale so that the maximum absolute value is nO[1]
// nO[0] = 3: scale so that the minimum absolute value is nO[1]
// nO[0] = 4: scale so that the L-nO[1] norm equals nO[2]
/* *********************************************************** */
/* NOTE: indexing starts at 1 in the following few arrays: */
/* (slot 0 is allocated but unused) */
/* *********************************************************** */
//private double[] lambda;
private ArrayList<Double> lambda = new ArrayList<Double>();
// the current weight vector. NOTE: indexing starts at 1.
// (a list rather than an array because its size may grow over time)
private ArrayList<Double> bestLambda = new ArrayList<Double>();
// the best weight vector across all iterations
private boolean[] isOptimizable;
// isOptimizable[c] = true iff lambda[c] should be optimized
private double[] minRandValue;
private double[] maxRandValue;
// when choosing a random value for the lambda[c] parameter, it will be
// chosen from the [minRandValue[c],maxRandValue[c]] range.
// (*) minRandValue and maxRandValue must be real values, but not -Inf or +Inf
private double[] defaultLambda;
// "default" parameter values; simply the values read in the parameter file
// USED FOR NON-OPTIMIZABLE (FIXED) FEATURES
/* *********************************************************** */
/* *********************************************************** */
private Decoder myDecoder;
// COMMENT OUT if decoder is not Joshua
// the in-process Joshua decoder; initialize() creates it only when neither a decoder
// command nor a fake-file template is configured, and sets it to null otherwise
private String decoderCommand;
// the command that runs the decoder; read from decoderCommandFileName
// (stays null when that file is not specified or does not exist)
private int decVerbosity;
// verbosity level for decoder output. If 0, decoder output is ignored.
// If 1, decoder output is printed.
private int validDecoderExitValue;
// return value from running the decoder command that indicates success
private int numOptThreads;
// number of threads to run things in parallel
private int saveInterFiles;
// 0: nothing, 1: only configs, 2: only n-bests, 3: both configs and n-bests
private int compressFiles;
// should MIRA gzip the large files? If 0, no compression takes place.
// If 1, compression is performed on: decoder output files, temp sents files,
// and temp feats files.
private int sizeOfNBest;
// size of N-best list generated by decoder at each iteration
// (aka simply N, but N is a bad variable name)
private long seed;
// seed used to create random number generators
private boolean randInit;
// if true, parameters are initialized randomly. If false, parameters
// are initialized using values from parameter file.
private int maxMERTIterations, minMERTIterations, prevMERTIterations;
// max: maximum number of MERT iterations
// min: minimum number of MERT iterations before an early MERT exit
// prev: number of previous MERT iterations from which to consider candidates (in addition to
// the candidates from the current iteration)
private double stopSigValue;
// early MERT exit if no weight changes by more than stopSigValue
// (but see minMERTIterations above and stopMinIts below)
private int stopMinIts;
// some early stopping criterion must be satisfied in stopMinIts *consecutive* iterations
// before an early exit (but see minMERTIterations above)
private boolean oneModificationPerIteration;
// if true, each MERT iteration performs at most one parameter modification.
// If false, a new MERT iteration starts (i.e. a new N-best list is
// generated) only after the previous iteration reaches a local maximum.
private String metricName;
// name of evaluation metric optimized by MERT
private String metricName_display;
// name of evaluation metric optimized by MERT, possibly with "doc-level " prefixed
private String[] metricOptions;
// options for the evaluation metric (e.g. for BLEU, maxGramLength and effLengthMethod)
private EvaluationMetric evalMetric;
// the evaluation metric used by MERT
private int suffStatsCount;
// number of sufficient statistics for the evaluation metric
private String tmpDirPrefix;
// prefix for the MIRA.temp.* files
private boolean passIterationToDecoder;
// should the iteration number be passed as an argument to decoderCommandFileName?
// ---- settings used by MIRA itself ----
private boolean needShuffle = true; // shuffle the training sentences or not
private boolean needAvg = true; // average the weights or not?
private boolean runPercep = false; // run perceptron instead of MIRA
private boolean usePseudoBleu = true; // use a pseudo corpus to compute BLEU?
private boolean returnBest = true; // return the best weights seen during tuning
private boolean needScale = true; // need scaling?
private String trainingMode;
private int oraSelectMode = 1;
private int predSelectMode = 1;
private int miraIter = 1;
private double C = 0.01; // relaxation coefficient
private double R = 0.99; // corpus decay when a pseudo corpus is used for BLEU computation
//private double sentForScale = 0.15; //percentage of sentences for scale factor estimation
private double scoreRatio = 5.0; // scale so that model_score/metric_score = scoreRatio
private double prevMetricScore = 0; // final metric score of the previous iteration, used only when returnBest = true
private String dirPrefix; // where are all these files located?
private String paramsFileName, docInfoFileName, finalLambdaFileName;
private String sourceFileName, refFileName, decoderOutFileName;
private String decoderConfigFileName, decoderCommandFileName;
private String fakeFileNameTemplate, fakeFileNamePrefix, fakeFileNameSuffix;
// e.g. output.it[1-x].someOldRun would be specified as:
// output.it?.someOldRun
// and we'd have prefix = "output.it" and suffix = ".someOldRun"
// private int useDisk;
/**
 * Creates an instance that only stores the given decoder configuration. Unlike the other
 * constructors, this one processes no arguments and does not call initialize(int).
 *
 * @param joshuaConfiguration the decoder configuration to keep for later use
 */
public MIRACore(JoshuaConfiguration joshuaConfiguration) {
this.joshuaConfiguration = joshuaConfiguration;
}
/**
 * Creates a fully initialized instance from an argument array.
 *
 * <p>Registers the known evaluation metrics, parses {@code args}, and then runs
 * {@code initialize(0)}.
 *
 * @param args the option array to be parsed by processArgsArray
 * @param joshuaConfiguration the decoder configuration to keep for later use
 */
public MIRACore(String[] args, JoshuaConfiguration joshuaConfiguration) {
  // The single-argument constructor does nothing but store the configuration.
  this(joshuaConfiguration);

  EvaluationMetric.set_knownMetrics();
  processArgsArray(args);
  initialize(0);
}
public MIRACore(String configFileName, JoshuaConfiguration joshuaConfiguration) {
this.joshuaConfiguration = joshuaConfiguration;
EvaluationMetric.set_knownMetrics();
processArgsArray(cfgFileToArgsArray(configFileName));
initialize(0);
}
/**
 * Prepares this object for a tuning run.
 *
 * <p>In order, this method: seeds the random number generator (discarding
 * {@code randsToSkip} draws), counts the dev-set sentences and the initial features,
 * registers the feature names with the Vocabulary, allocates the per-parameter arrays and
 * reads their values through processParamFile(), loads and normalizes the reference
 * translations, reads the decoder command (if a command file exists), configures the
 * EvaluationMetric, and finally loads the in-process Joshua decoder when neither a decoder
 * command nor a fake-file template is configured.
 *
 * <p>Problems while reading the parameter, reference or command files are reported on stderr
 * and terminate the JVM through System.exit.
 *
 * @param randsToSkip number of random draws to discard right after seeding; 0 marks the first
 *        initialization, which additionally prints a summary and renames the decoder config
 *        file to {@code decoderConfigFileName + ".MIRA.orig"}
 */
private void initialize(int randsToSkip) {
  println("NegInf: " + NegInf + ", PosInf: " + PosInf + ", epsilon: " + epsilon, 4);

  randGen = new Random(seed);
  for (int r = 1; r <= randsToSkip; ++r) {
    randGen.nextDouble();
  }
  generatedRands = randsToSkip;

  if (randsToSkip == 0) {
    println("----------------------------------------------------", 1);
    println("Initializing...", 1);
    println("----------------------------------------------------", 1);
    println("", 1);
    println("Random number generator initialized using seed: " + seed, 1);
    println("", 1);
  }

  // count the total number of sentences to be decoded; refFileName is the combined
  // (auto-generated) reference file
  numSentences = countLines(refFileName) / refsPerSen;

  // sets numDocuments and docOfSentence[]
  processDocInfo();

  if (numDocuments > 1) {
    metricName_display = "doc-level " + metricName;
  }

  // NOTE(review): presumably completes the document-subset description -- confirm in
  // set_docSubsetInfo
  set_docSubsetInfo(docSubsetInfo);

  // count the number of initial features: the parameter file has one line per parameter
  // plus one line for the normalization method
  numParams = countNonEmptyLines(paramsFileName) - 1;
  numParamsOld = numParams;

  // read the dense parameter names from the parameter config file
  try {
    BufferedReader inFile_names = new BufferedReader(new FileReader(paramsFileName));
    for (int c = 1; c <= numParams; ++c) {
      String line = "";
      while (line != null && line.length() == 0) { // skip empty lines
        line = inFile_names.readLine();
      }
      // A short or malformed file used to surface as a NullPointerException or a
      // StringIndexOutOfBoundsException; report it through the IOException handler below.
      if (line == null) {
        throw new IOException("unexpected end of " + paramsFileName
            + " while reading parameter #" + c);
      }
      int separatorPos = line.indexOf("|||");
      if (separatorPos < 0) {
        throw new IOException("missing \"|||\" separator in " + paramsFileName + ": " + line);
      }
      // register the feature name with the vocabulary
      String paramName = line.substring(0, separatorPos).trim();
      Vocabulary.id(paramName);
    }
    inFile_names.close();
  } catch (FileNotFoundException e) {
    System.err.println("FileNotFoundException in MIRACore.initialize(int): " + e.getMessage());
    System.exit(99901);
  } catch (IOException e) {
    System.err.println("IOException in MIRACore.initialize(int): " + e.getMessage());
    System.exit(99902);
  }

  // indexing starts at 1 in these containers, hence numParams + 1 slots
  for (int p = 0; p <= numParams; ++p) {
    lambda.add(Double.valueOf(0.0));
  }
  // NOTE(review): exactly one element is appended to bestLambda here, as in the original
  // brace-less loop. Confirm whether bestLambda was meant to be sized like lambda.
  bestLambda.add(Double.valueOf(0.0));

  // Only lambda is a list because its size may increase over time; the other arrays are
  // specified in the param config file and only used for initialization.
  isOptimizable = new boolean[1 + numParams];
  minRandValue = new double[1 + numParams];
  maxRandValue = new double[1 + numParams];
  defaultLambda = new double[1 + numParams];
  normalizationOptions = new double[3];

  // read initial param values; sets the arrays allocated just above
  processParamFile();

  String[][] refSentences = new String[numSentences][refsPerSen];

  try {
    // read in reference sentences
    InputStream inStream_refs = new FileInputStream(new File(refFileName));
    BufferedReader inFile_refs = new BufferedReader(new InputStreamReader(inStream_refs, "utf8"));
    for (int i = 0; i < numSentences; ++i) {
      for (int r = 0; r < refsPerSen; ++r) {
        // the r'th reference translation for the i'th sentence
        refSentences[i][r] = inFile_refs.readLine();
      }
    }
    inFile_refs.close();

    // normalize reference sentences
    for (int i = 0; i < numSentences; ++i) {
      for (int r = 0; r < refsPerSen; ++r) {
        refSentences[i][r] = normalize(refSentences[i][r], textNormMethod);
      }
    }

    // read in decoder command, if any
    decoderCommand = null;
    if (decoderCommandFileName != null) {
      if (fileExists(decoderCommandFileName)) {
        BufferedReader inFile_comm = new BufferedReader(new FileReader(decoderCommandFileName));
        decoderCommand = inFile_comm.readLine(); // the command is the file's first line
        inFile_comm.close();
      }
    }
  } catch (FileNotFoundException e) {
    System.err.println("FileNotFoundException in MIRACore.initialize(int): " + e.getMessage());
    System.exit(99901);
  } catch (IOException e) {
    System.err.println("IOException in MIRACore.initialize(int): " + e.getMessage());
    System.exit(99902);
  }

  // set static data members for the EvaluationMetric class
  EvaluationMetric.set_numSentences(numSentences);
  EvaluationMetric.set_numDocuments(numDocuments);
  EvaluationMetric.set_refsPerSen(refsPerSen);
  EvaluationMetric.set_refSentences(refSentences);
  EvaluationMetric.set_tmpDirPrefix(tmpDirPrefix);

  evalMetric = EvaluationMetric.getMetric(metricName, metricOptions);

  // used only if returnBest = true: start from the worst possible score
  prevMetricScore = evalMetric.getToBeMinimized() ? PosInf : NegInf;

  // length of the sufficient statistics (for BLEU: 2*ngram+2 = 8)
  suffStatsCount = evalMetric.get_suffStatsCount();

  // print info
  if (randsToSkip == 0) { // i.e. first iteration
    println("Number of sentences: " + numSentences, 1);
    println("Number of documents: " + numDocuments, 1);
    println("Optimizing " + metricName_display, 1);

    println("Number of initial features: " + numParams, 1);
    print("Initial feature names: {", 1);
    for (int c = 1; c <= numParams; ++c) {
      print("\"" + Vocabulary.word(c) + "\"", 1);
    }
    println("}", 1);
    println("", 1);

    // TODO just print the correct info
    println("c Default value\tOptimizable?\tRand. val. range", 1);
    for (int c = 1; c <= numParams; ++c) {
      print(c + " " + f4.format(lambda.get(c).doubleValue()) + "\t\t", 1);
      if (!isOptimizable[c]) {
        println(" No", 1);
      } else {
        print(" Yes\t\t", 1);
        print(" [" + minRandValue[c] + "," + maxRandValue[c] + "]", 1);
        println("", 1);
      }
    }
    println("", 1);

    print("Weight vector normalization method: ", 1);
    if (normalizationOptions[0] == 0) {
      println("none.", 1);
    } else if (normalizationOptions[0] == 1) {
      println(
          "weights will be scaled so that the \""
              + Vocabulary.word((int) normalizationOptions[2])
              + "\" weight has an absolute value of " + normalizationOptions[1] + ".", 1);
    } else if (normalizationOptions[0] == 2) {
      println("weights will be scaled so that the maximum absolute value is "
          + normalizationOptions[1] + ".", 1);
    } else if (normalizationOptions[0] == 3) {
      println("weights will be scaled so that the minimum absolute value is "
          + normalizationOptions[1] + ".", 1);
    } else if (normalizationOptions[0] == 4) {
      println("weights will be scaled so that the L-" + normalizationOptions[1] + " norm is "
          + normalizationOptions[2] + ".", 1);
    }
    println("", 1);

    println("----------------------------------------------------", 1);
    println("", 1);

    // rename original config file so it doesn't get overwritten
    // (original name will be restored in finish())
    renameFile(decoderConfigFileName, decoderConfigFileName + ".MIRA.orig");
  } // if (randsToSkip == 0)

  // by default, load the Joshua decoder in-process
  if (decoderCommand == null && fakeFileNameTemplate == null) {
    println("Loading Joshua decoder...", 1);
    myDecoder = new Decoder(joshuaConfiguration, decoderConfigFileName + ".MIRA.orig");
    println("...finished loading @ " + (new Date()), 1);
    println("");
  } else {
    myDecoder = null;
  }

  // generic array creation is not allowed, so allocate a raw array and assign it
  @SuppressWarnings("unchecked")
  TreeSet<Integer>[] temp_TSA = new TreeSet[numSentences];
  indicesOfInterest_all = temp_TSA;
  for (int i = 0; i < numSentences; ++i) {
    indicesOfInterest_all[i] = new TreeSet<Integer>();
  }
} // void initialize(...)
// -------------------------
/**
 * Runs MIRA with the iteration limits stored in this object: minMERTIterations,
 * maxMERTIterations and prevMERTIterations.
 */
public void run_MIRA() {
run_MIRA(minMERTIterations, maxMERTIterations, prevMERTIterations);
}
/**
 * Runs the MIRA tuning loop.
 *
 * <p>Stale "MIRA.temp*" files are first removed from the directory that tmpDirPrefix points
 * into. The weight vector is optionally re-initialized at random, and then
 * run_single_iteration is called repeatedly until it returns null or flags the iteration as
 * the last one. Finally the per-iteration temp files are deleted.
 *
 * @param minIts minimum number of iterations; forwarded to run_single_iteration
 * @param maxIts maximum number of iterations; forwarded to run_single_iteration and also the
 *        upper bound of the final temp-file cleanup
 * @param prevIts number of previous iterations whose candidates are considered; forwarded to
 *        run_single_iteration
 */
public void run_MIRA(int minIts, int maxIts, int prevIts) {
  // FIRST, CLEAN ALL PREVIOUS TEMP FILES
  String dir;
  int k = tmpDirPrefix.lastIndexOf("/");
  if (k >= 0) {
    dir = tmpDirPrefix.substring(0, k + 1);
  } else {
    dir = "./";
  }

  File folder = new File(dir);
  if (folder.exists()) {
    // listFiles() returns null when the path is not a directory or cannot be read
    File[] listOfFiles = folder.listFiles();
    if (listOfFiles != null) {
      for (File candidate : listOfFiles) {
        if (candidate.isFile() && candidate.getName().startsWith("MIRA.temp")) {
          // Delete by full path: the bare file name would be resolved against the
          // working directory rather than against dir.
          deleteFile(candidate.getPath());
        }
      }
    }
  }

  println("----------------------------------------------------", 1);
  println("MIRA run started @ " + (new Date()), 1);
  println("----------------------------------------------------", 1);
  println("", 1);

  // if no default lambda is provided
  if (randInit) {
    println("Initializing lambda[] randomly.", 1);
    // initialize optimizable parameters randomly (sampling uniformly from
    // that parameter's random value range)
    lambda = randomLambda();
  }

  println("Initial lambda[]: " + lambdaToString(lambda), 1);
  println("", 1);

  int[] maxIndex = new int[numSentences];

  // number of consecutive iterations in which an early stopping criterion was satisfied
  int earlyStop = 0;

  for (int iteration = 1;; ++iteration) {
    // Contents of the returned array:
    // [0]: FINAL_score
    // [1]: earlyStop
    // [2]: 1 if this should be the last iteration
    double[] A = run_single_iteration(iteration, minIts, maxIts, prevIts, earlyStop, maxIndex);
    if (A == null) {
      break;
    }
    earlyStop = (int) A[1];
    if (A[2] == 1) {
      break;
    }
  } // for (iteration)

  println("", 1);
  println("----------------------------------------------------", 1);
  println("MIRA run ended @ " + (new Date()), 1);
  println("----------------------------------------------------", 1);
  println("", 1);

  if (!returnBest) {
    println("FINAL lambda: " + lambdaToString(lambda), 1);
  } else {
    // NOTE(review): this branch prints lambda, not bestLambda. Confirm that lambda already
    // holds the best weights at this point.
    println("BEST lambda: " + lambdaToString(lambda), 1);
  }

  // delete intermediate .temp.*.it* decoder output files
  String suffix = (compressFiles == 1) ? ".gz" : "";
  for (int iteration = 1; iteration <= maxIts; ++iteration) {
    deleteFile(tmpDirPrefix + "temp.sents.it" + iteration + suffix);
    deleteFile(tmpDirPrefix + "temp.feats.it" + iteration + suffix);

    // when a ".copy" of the stats file exists, only the copy is removed
    String statsCopy = tmpDirPrefix + "temp.stats.it" + iteration + ".copy" + suffix;
    if (fileExists(statsCopy)) {
      deleteFile(statsCopy);
    } else {
      deleteFile(tmpDirPrefix + "temp.stats.it" + iteration + suffix);
    }
  }
} // void run_MIRA(int maxIts)
// this is the key function!
@SuppressWarnings("unchecked")
public double[] run_single_iteration(int iteration, int minIts, int maxIts, int prevIts,
int earlyStop, int[] maxIndex) {
double FINAL_score = 0;
double[] retA = new double[3];
// retA[0]: FINAL_score
// retA[1]: earlyStop
// retA[2]: should this be the last iteration?
boolean done = false;
retA[2] = 1; // will only be made 0 if we don't break from the following loop
// save feats and stats for all candidates(old & new)
HashMap<String, String>[] feat_hash = new HashMap[numSentences];
for (int i = 0; i < numSentences; i++)
feat_hash[i] = new HashMap<String, String>();
HashMap<String, String>[] stats_hash = new HashMap[numSentences];
for (int i = 0; i < numSentences; i++)
stats_hash[i] = new HashMap<String, String>();
while (!done) { // NOTE: this "loop" will only be carried out once
println("--- Starting MIRA iteration #" + iteration + " @ " + (new Date()) + " ---", 1);
// printMemoryUsage();
/******************************/
// CREATE DECODER CONFIG FILE //
/******************************/
createConfigFile(lambda, decoderConfigFileName, decoderConfigFileName + ".MIRA.orig");
// i.e. use the original config file as a template
/***************/
// RUN DECODER //
/***************/
if (iteration == 1) {
println("Decoding using initial weight vector " + lambdaToString(lambda), 1);
} else {
println("Redecoding using weight vector " + lambdaToString(lambda), 1);
}
// generate the n-best file after decoding
String[] decRunResult = run_decoder(iteration); // iteration passed in case fake decoder will
// be used
// [0] name of file to be processed
// [1] indicates how the output file was obtained:
// 1: external decoder
// 2: fake decoder
// 3: internal decoder
if (!decRunResult[1].equals("2")) {
println("...finished decoding @ " + (new Date()), 1);
}
checkFile(decRunResult[0]);
/************* END OF DECODING **************/
println("Producing temp files for iteration " + iteration, 3);
produceTempFiles(decRunResult[0], iteration);
// save intermedidate output files
// save joshua.config.mira.it*
if (saveInterFiles == 1 || saveInterFiles == 3) { // make copy of intermediate config file
if (!copyFile(decoderConfigFileName, decoderConfigFileName + ".MIRA.it" + iteration)) {
println("Warning: attempt to make copy of decoder config file (to create"
+ decoderConfigFileName + ".MIRA.it" + iteration + ") was unsuccessful!", 1);
}
}
// save output.nest.MIRA.it*
if (saveInterFiles == 2 || saveInterFiles == 3) { // make copy of intermediate decoder output
// file...
if (!decRunResult[1].equals("2")) { // ...but only if no fake decoder
if (!decRunResult[0].endsWith(".gz")) {
if (!copyFile(decRunResult[0], decRunResult[0] + ".MIRA.it" + iteration)) {
println("Warning: attempt to make copy of decoder output file (to create"
+ decRunResult[0] + ".MIRA.it" + iteration + ") was unsuccessful!", 1);
}
} else {
String prefix = decRunResult[0].substring(0, decRunResult[0].length() - 3);
if (!copyFile(prefix + ".gz", prefix + ".MIRA.it" + iteration + ".gz")) {
println("Warning: attempt to make copy of decoder output file (to create" + prefix
+ ".MIRA.it" + iteration + ".gz" + ") was unsuccessful!", 1);
}
}
if (compressFiles == 1 && !decRunResult[0].endsWith(".gz")) {
gzipFile(decRunResult[0] + ".MIRA.it" + iteration);
}
} // if (!fake)
}
// ------------- end of saving .mira.it* files ---------------
int[] candCount = new int[numSentences];
int[] lastUsedIndex = new int[numSentences];
ConcurrentHashMap<Integer, int[]>[] suffStats_array = new ConcurrentHashMap[numSentences];
for (int i = 0; i < numSentences; ++i) {
candCount[i] = 0;
lastUsedIndex[i] = -1;
// suffStats_array[i].clear();
suffStats_array[i] = new ConcurrentHashMap<Integer, int[]>();
}
// initLambda[0] is not used!
double[] initialLambda = new double[1 + numParams];
for (int i = 1; i <= numParams; ++i)
initialLambda[i] = lambda.get(i);
// the "score" in initialScore refers to that
// assigned by the evaluation metric)
// you may consider all candidates from iter 1, or from iter (iteration-prevIts) to current
// iteration
int firstIt = Math.max(1, iteration - prevIts);
// i.e. only process candidates from the current iteration and candidates
// from up to prevIts previous iterations.
println("Reading candidate translations from iterations " + firstIt + "-" + iteration, 1);
println("(and computing " + metricName
+ " sufficient statistics for previously unseen candidates)", 1);
print(" Progress: ");
int[] newCandidatesAdded = new int[1 + iteration];
for (int it = 1; it <= iteration; ++it)
newCandidatesAdded[it] = 0;
try {
// read temp files from all past iterations
// 3 types of temp files:
// 1. output hypo at iter i
// 2. feature value of each hypo at iter i
// 3. suff stats of each hypo at iter i
// each inFile corresponds to the output of an iteration
// (index 0 is not used; no corresponding index for the current iteration)
BufferedReader[] inFile_sents = new BufferedReader[iteration];
BufferedReader[] inFile_feats = new BufferedReader[iteration];
BufferedReader[] inFile_stats = new BufferedReader[iteration];
// temp file(array) from previous iterations
for (int it = firstIt; it < iteration; ++it) {
InputStream inStream_sents, inStream_feats, inStream_stats;
if (compressFiles == 0) {
inStream_sents = new FileInputStream(tmpDirPrefix + "temp.sents.it" + it);
inStream_feats = new FileInputStream(tmpDirPrefix + "temp.feats.it" + it);
inStream_stats = new FileInputStream(tmpDirPrefix + "temp.stats.it" + it);
} else {
inStream_sents = new GZIPInputStream(new FileInputStream(tmpDirPrefix + "temp.sents.it"
+ it + ".gz"));
inStream_feats = new GZIPInputStream(new FileInputStream(tmpDirPrefix + "temp.feats.it"
+ it + ".gz"));
inStream_stats = new GZIPInputStream(new FileInputStream(tmpDirPrefix + "temp.stats.it"
+ it + ".gz"));
}
inFile_sents[it] = new BufferedReader(new InputStreamReader(inStream_sents, "utf8"));
inFile_feats[it] = new BufferedReader(new InputStreamReader(inStream_feats, "utf8"));
inFile_stats[it] = new BufferedReader(new InputStreamReader(inStream_stats, "utf8"));
}
InputStream inStream_sentsCurrIt, inStream_featsCurrIt, inStream_statsCurrIt;
// temp file for current iteration!
if (compressFiles == 0) {
inStream_sentsCurrIt = new FileInputStream(tmpDirPrefix + "temp.sents.it" + iteration);
inStream_featsCurrIt = new FileInputStream(tmpDirPrefix + "temp.feats.it" + iteration);
} else {
inStream_sentsCurrIt = new GZIPInputStream(new FileInputStream(tmpDirPrefix
+ "temp.sents.it" + iteration + ".gz"));
inStream_featsCurrIt = new GZIPInputStream(new FileInputStream(tmpDirPrefix
+ "temp.feats.it" + iteration + ".gz"));
}
BufferedReader inFile_sentsCurrIt = new BufferedReader(new InputStreamReader(
inStream_sentsCurrIt, "utf8"));
BufferedReader inFile_featsCurrIt = new BufferedReader(new InputStreamReader(
inStream_featsCurrIt, "utf8"));
BufferedReader inFile_statsCurrIt = null; // will only be used if statsCurrIt_exists below
// is set to true
PrintWriter outFile_statsCurrIt = null; // will only be used if statsCurrIt_exists below is
// set to false
// just to check if temp.stat.it.iteration exists
boolean statsCurrIt_exists = false;
if (fileExists(tmpDirPrefix + "temp.stats.it" + iteration)) {
inStream_statsCurrIt = new FileInputStream(tmpDirPrefix + "temp.stats.it" + iteration);
inFile_statsCurrIt = new BufferedReader(new InputStreamReader(inStream_statsCurrIt,
"utf8"));
statsCurrIt_exists = true;
copyFile(tmpDirPrefix + "temp.stats.it" + iteration, tmpDirPrefix + "temp.stats.it"
+ iteration + ".copy");
} else if (fileExists(tmpDirPrefix + "temp.stats.it" + iteration + ".gz")) {
inStream_statsCurrIt = new GZIPInputStream(new FileInputStream(tmpDirPrefix
+ "temp.stats.it" + iteration + ".gz"));
inFile_statsCurrIt = new BufferedReader(new InputStreamReader(inStream_statsCurrIt,
"utf8"));
statsCurrIt_exists = true;
copyFile(tmpDirPrefix + "temp.stats.it" + iteration + ".gz", tmpDirPrefix
+ "temp.stats.it" + iteration + ".copy.gz");
} else {
outFile_statsCurrIt = new PrintWriter(tmpDirPrefix + "temp.stats.it" + iteration);
}
// output the 4^th temp file: *.temp.stats.merged
PrintWriter outFile_statsMerged = new PrintWriter(tmpDirPrefix + "temp.stats.merged");
// write sufficient statistics from all the sentences
// from the output files into a single file
PrintWriter outFile_statsMergedKnown = new PrintWriter(tmpDirPrefix
+ "temp.stats.mergedKnown");
// write sufficient statistics from all the sentences
// from the output files into a single file
// output the 5^th 6^th temp file, but will be deleted at the end of the function
FileOutputStream outStream_unknownCands = new FileOutputStream(tmpDirPrefix
+ "temp.currIt.unknownCands", false);
OutputStreamWriter outStreamWriter_unknownCands = new OutputStreamWriter(
outStream_unknownCands, "utf8");
BufferedWriter outFile_unknownCands = new BufferedWriter(outStreamWriter_unknownCands);
PrintWriter outFile_unknownIndices = new PrintWriter(tmpDirPrefix
+ "temp.currIt.unknownIndices");
String sents_str, feats_str, stats_str;
// BUG: this assumes a candidate string cannot be produced for two
// different source sentences, which is not necessarily true
// (It's not actually a bug, but only because existingCandStats gets
// cleared before moving to the next source sentence.)
// FIX: should be made an array, indexed by i
HashMap<String, String> existingCandStats = new HashMap<String, String>();
// VERY IMPORTANT:
// A CANDIDATE X MAY APPEARED IN ITER 1, ITER 3
// BUT IF THE USER SPECIFIED TO CONSIDER ITERATIONS FROM ONLY ITER 2, THEN
// X IS NOT A "REPEATED" CANDIDATE IN ITER 3. THEREFORE WE WANT TO KEEP THE
// SUFF STATS FOR EACH CANDIDATE(TO SAVE COMPUTATION IN THE FUTURE)
// Stores precalculated sufficient statistics for candidates, in case
// the same candidate is seen again. (SS stored as a String.)
// Q: Why do we care? If we see the same candidate again, aren't we going
// to ignore it? So, why do we care about the SS of this repeat candidate?
// A: A "repeat" candidate may not be a repeat candidate in later
// iterations if the user specifies a value for prevMERTIterations
// that causes MERT to skip candidates from early iterations.
double[] currFeatVal = new double[1 + numParams];
String[] featVal_str;
int totalCandidateCount = 0;
// new candidate size for each sentence
int[] sizeUnknown_currIt = new int[numSentences];
for (int i = 0; i < numSentences; ++i) {
// process candidates from previous iterations
// low efficiency? for each iteration, it reads in all previous iteration outputs
// therefore a lot of overlapping jobs
// this is an easy implementation to deal with the situation in which user only specified
// "previt" and hopes to consider only the previous previt
// iterations, then for each iteration the existing candadites will be different
for (int it = firstIt; it < iteration; ++it) {
// Why up to but *excluding* iteration?
// Because the last iteration is handled a little differently, since
// the SS must be calculated (and the corresponding file created),
// which is not true for previous iterations.
for (int n = 0; n <= sizeOfNBest; ++n) {
// note that in all temp files, "||||||" is a separator between 2 n-best lists
// Why up to and *including* sizeOfNBest?
// So that it would read the "||||||" separator even if there is
// a complete list of sizeOfNBest candidates.
// for the nth candidate for the ith sentence, read the sentence, feature values,
// and sufficient statistics from the various temp files
// read one line of temp.sent, temp.feat, temp.stats from iteration it
sents_str = inFile_sents[it].readLine();
feats_str = inFile_feats[it].readLine();
stats_str = inFile_stats[it].readLine();
if (sents_str.equals("||||||")) {
n = sizeOfNBest + 1; // move on to the next n-best list
} else if (!existingCandStats.containsKey(sents_str)) // if this candidate does not
// exist
{
outFile_statsMergedKnown.println(stats_str);
// save feats & stats
feat_hash[i].put(sents_str, feats_str);
stats_hash[i].put(sents_str, stats_str);
// extract feature value
featVal_str = feats_str.split("\\s+");
if (feats_str.indexOf('=') != -1) {
for (String featurePair : featVal_str) {
String[] pair = featurePair.split("=");
String name = pair[0];
Double value = Double.parseDouble(pair[1]);
}
}
existingCandStats.put(sents_str, stats_str);
candCount[i] += 1;
newCandidatesAdded[it] += 1;
} // if unseen candidate
} // for (n)
} // for (it)
outFile_statsMergedKnown.println("||||||");
// ---------- end of processing previous iterations ----------
// ---------- now start processing new candidates ----------
// now process the candidates of the current iteration
// now determine the new candidates of the current iteration
/*
* remember: BufferedReader inFile_sentsCurrIt BufferedReader inFile_featsCurrIt
* PrintWriter outFile_statsCurrIt
*/
String[] sentsCurrIt_currSrcSent = new String[sizeOfNBest + 1];
Vector<String> unknownCands_V = new Vector<String>();
// which candidates (of the i'th source sentence) have not been seen before
// this iteration?
for (int n = 0; n <= sizeOfNBest; ++n) {
// Why up to and *including* sizeOfNBest?
// So that it would read the "||||||" separator even if there is
// a complete list of sizeOfNBest candidates.
// for the nth candidate for the ith sentence, read the sentence,
// and store it in the sentsCurrIt_currSrcSent array
sents_str = inFile_sentsCurrIt.readLine(); // read one candidate from the current
// iteration
sentsCurrIt_currSrcSent[n] = sents_str; // Note: possibly "||||||"
if (sents_str.equals("||||||")) {
n = sizeOfNBest + 1;
} else if (!existingCandStats.containsKey(sents_str)) {
unknownCands_V.add(sents_str); // NEW CANDIDATE FROM THIS ITERATION
writeLine(sents_str, outFile_unknownCands);
outFile_unknownIndices.println(i); // INDEX OF THE NEW CANDIDATES
newCandidatesAdded[iteration] += 1;
existingCandStats.put(sents_str, "U"); // i.e. unknown
// we add sents_str to avoid duplicate entries in unknownCands_V
}
} // for (n)
// only compute suff stats for new candidates
// now unknownCands_V has the candidates for which we need to calculate
// sufficient statistics (for the i'th source sentence)
int sizeUnknown = unknownCands_V.size();
sizeUnknown_currIt[i] = sizeUnknown;
existingCandStats.clear();
} // for (i) each sentence
// ---------- end of merging candidates stats from previous iterations
// and finding new candidates ------------
/*
* int[][] newSuffStats = null; if (!statsCurrIt_exists && sizeUnknown > 0) { newSuffStats =
* evalMetric.suffStats(unknownCands, indices); }
*/
outFile_statsMergedKnown.close();
outFile_unknownCands.close();
outFile_unknownIndices.close();
// want to re-open all temp files and start from scratch again?
for (int it = firstIt; it < iteration; ++it) // previous iterations temp files
{
inFile_sents[it].close();
inFile_stats[it].close();
InputStream inStream_sents, inStream_stats;
if (compressFiles == 0) {
inStream_sents = new FileInputStream(tmpDirPrefix + "temp.sents.it" + it);
inStream_stats = new FileInputStream(tmpDirPrefix + "temp.stats.it" + it);
} else {
inStream_sents = new GZIPInputStream(new FileInputStream(tmpDirPrefix + "temp.sents.it"
+ it + ".gz"));
inStream_stats = new GZIPInputStream(new FileInputStream(tmpDirPrefix + "temp.stats.it"
+ it + ".gz"));
}
inFile_sents[it] = new BufferedReader(new InputStreamReader(inStream_sents, "utf8"));
inFile_stats[it] = new BufferedReader(new InputStreamReader(inStream_stats, "utf8"));
}
inFile_sentsCurrIt.close();
// current iteration temp files
if (compressFiles == 0) {
inStream_sentsCurrIt = new FileInputStream(tmpDirPrefix + "temp.sents.it" + iteration);
} else {
inStream_sentsCurrIt = new GZIPInputStream(new FileInputStream(tmpDirPrefix
+ "temp.sents.it" + iteration + ".gz"));
}
inFile_sentsCurrIt = new BufferedReader(new InputStreamReader(inStream_sentsCurrIt, "utf8"));
// calculate SS for unseen candidates and write them to file
FileInputStream inStream_statsCurrIt_unknown = null;
BufferedReader inFile_statsCurrIt_unknown = null;
if (!statsCurrIt_exists && newCandidatesAdded[iteration] > 0) {
// create the file...
evalMetric.createSuffStatsFile(tmpDirPrefix + "temp.currIt.unknownCands", tmpDirPrefix
+ "temp.currIt.unknownIndices", tmpDirPrefix + "temp.stats.unknown", sizeOfNBest);
// ...and open it
inStream_statsCurrIt_unknown = new FileInputStream(tmpDirPrefix + "temp.stats.unknown");
inFile_statsCurrIt_unknown = new BufferedReader(new InputStreamReader(
inStream_statsCurrIt_unknown, "utf8"));
}
// open mergedKnown file
// newly created by the big loop above
FileInputStream instream_statsMergedKnown = new FileInputStream(tmpDirPrefix
+ "temp.stats.mergedKnown");
BufferedReader inFile_statsMergedKnown = new BufferedReader(new InputStreamReader(
instream_statsMergedKnown, "utf8"));
//num of features before observing new firing features from this iteration
numParamsOld = numParams;
for (int i = 0; i < numSentences; ++i) {
// reprocess candidates from previous iterations
for (int it = firstIt; it < iteration; ++it) {
for (int n = 0; n <= sizeOfNBest; ++n) {
sents_str = inFile_sents[it].readLine();
stats_str = inFile_stats[it].readLine();
if (sents_str.equals("||||||")) {
n = sizeOfNBest + 1;
} else if (!existingCandStats.containsKey(sents_str)) {
existingCandStats.put(sents_str, stats_str);
} // if unseen candidate
} // for (n)
} // for (it)
// copy relevant portion from mergedKnown to the merged file
String line_mergedKnown = inFile_statsMergedKnown.readLine();
while (!line_mergedKnown.equals("||||||")) {
outFile_statsMerged.println(line_mergedKnown);
line_mergedKnown = inFile_statsMergedKnown.readLine();
}
int[] stats = new int[suffStatsCount];
for (int n = 0; n <= sizeOfNBest; ++n) {
sents_str = inFile_sentsCurrIt.readLine();
feats_str = inFile_featsCurrIt.readLine();
if (sents_str.equals("||||||")) {
n = sizeOfNBest + 1;
} else if (!existingCandStats.containsKey(sents_str)) {
if (!statsCurrIt_exists) {
stats_str = inFile_statsCurrIt_unknown.readLine();
String[] temp_stats = stats_str.split("\\s+");
for (int s = 0; s < suffStatsCount; ++s) {
stats[s] = Integer.parseInt(temp_stats[s]);
}
outFile_statsCurrIt.println(stats_str);
} else {
stats_str = inFile_statsCurrIt.readLine();
String[] temp_stats = stats_str.split("\\s+");
for (int s = 0; s < suffStatsCount; ++s) {
stats[s] = Integer.parseInt(temp_stats[s]);
}
}
outFile_statsMerged.println(stats_str);
// save feats & stats
// System.out.println(sents_str+" "+feats_str);
feat_hash[i].put(sents_str, feats_str);
stats_hash[i].put(sents_str, stats_str);
featVal_str = feats_str.split("\\s+");
if (feats_str.indexOf('=') != -1) {
for (String featurePair : featVal_str) {
String[] pair = featurePair.split("=");
String name = pair[0];
Double value = Double.parseDouble(pair[1]);
int featId = Vocabulary.id(name);
//need to identify newly fired feats here
//in this case currFeatVal is not given the value
//of the new feat, since the corresponding weight is
//initialized as zero anyway
if (featId > numParams) {
++numParams;
lambda.add(new Double(0));
}
}
}
existingCandStats.put(sents_str, stats_str);
candCount[i] += 1;
// newCandidatesAdded[iteration] += 1;
// moved to code above detecting new candidates
} else {
if (statsCurrIt_exists)
inFile_statsCurrIt.readLine();
else {
// write SS to outFile_statsCurrIt
stats_str = existingCandStats.get(sents_str);
outFile_statsCurrIt.println(stats_str);
}
}
} // for (n)
// now d = sizeUnknown_currIt[i] - 1
if (statsCurrIt_exists)
inFile_statsCurrIt.readLine();
else
outFile_statsCurrIt.println("||||||");
existingCandStats.clear();
totalCandidateCount += candCount[i];
// output sentence progress
if ((i + 1) % 500 == 0) {
print((i + 1) + "\n" + " ", 1);
} else if ((i + 1) % 100 == 0) {
print("+", 1);
} else if ((i + 1) % 25 == 0) {
print(".", 1);
}
} // for (i)
inFile_statsMergedKnown.close();
outFile_statsMerged.close();
// for testing
/*
* int total_sent = 0; for( int i=0; i<numSentences; i++ ) {
* System.out.println(feat_hash[i].size()+" "+candCount[i]); total_sent +=
* feat_hash[i].size(); feat_hash[i].clear(); }
* System.out.println("----------------total sent: "+total_sent); total_sent = 0; for( int
* i=0; i<numSentences; i++ ) { System.out.println(stats_hash[i].size()+" "+candCount[i]);
* total_sent += stats_hash[i].size(); stats_hash[i].clear(); }
* System.out.println("*****************total sent: "+total_sent);
*/
println("", 1); // finish progress line
for (int it = firstIt; it < iteration; ++it) {
inFile_sents[it].close();
inFile_feats[it].close();
inFile_stats[it].close();
}
inFile_sentsCurrIt.close();
inFile_featsCurrIt.close();
if (statsCurrIt_exists)
inFile_statsCurrIt.close();
else
outFile_statsCurrIt.close();
if (compressFiles == 1 && !statsCurrIt_exists) {
gzipFile(tmpDirPrefix + "temp.stats.it" + iteration);
}
// clear temp files
deleteFile(tmpDirPrefix + "temp.currIt.unknownCands");
deleteFile(tmpDirPrefix + "temp.currIt.unknownIndices");
deleteFile(tmpDirPrefix + "temp.stats.unknown");
deleteFile(tmpDirPrefix + "temp.stats.mergedKnown");
// cleanupMemory();
println("Processed " + totalCandidateCount + " distinct candidates " + "(about "
+ totalCandidateCount / numSentences + " per sentence):", 1);
for (int it = firstIt; it <= iteration; ++it) {
println("newCandidatesAdded[it=" + it + "] = " + newCandidatesAdded[it] + " (about "
+ newCandidatesAdded[it] / numSentences + " per sentence)", 1);
}
println("", 1);
println("Number of features observed so far: " + numParams);
println("", 1);
} catch (FileNotFoundException e) {
System.err.println("FileNotFoundException in MIRACore.run_single_iteration(6): "
+ e.getMessage());
System.exit(99901);
} catch (IOException e) {
System.err.println("IOException in MIRACore.run_single_iteration(6): " + e.getMessage());
System.exit(99902);
}
// n-best list converges
if (newCandidatesAdded[iteration] == 0) {
if (!oneModificationPerIteration) {
println("No new candidates added in this iteration; exiting MIRA.", 1);
println("", 1);
println("--- MIRA iteration #" + iteration + " ending @ " + (new Date()) + " ---", 1);
println("", 1);
deleteFile(tmpDirPrefix + "temp.stats.merged");
if (returnBest) {
//note that bestLambda.size() <= lambda.size()
for ( int p = 1; p < bestLambda.size(); ++p )
lambda.set(p, bestLambda.get(p));
//and set the rest of lambda to be 0
for ( int p = 0; p < lambda.size() - bestLambda.size(); ++p )
lambda.set(p+bestLambda.size(), new Double(0));
}
return null; // this means that the old values should be kept by the caller
} else {
println("Note: No new candidates added in this iteration.", 1);
}
}
/************* start optimization **************/
/*
* for( int v=1; v<initialLambda[1].length; v++ ) System.out.print(initialLambda[1][v]+" ");
* System.exit(0);
*/
Optimizer.sentNum = numSentences; // total number of training sentences
Optimizer.needShuffle = needShuffle;
Optimizer.miraIter = miraIter;
Optimizer.oraSelectMode = oraSelectMode;
Optimizer.predSelectMode = predSelectMode;
Optimizer.runPercep = runPercep;
Optimizer.C = C;
Optimizer.needAvg = needAvg;
//Optimizer.sentForScale = sentForScale;
Optimizer.scoreRatio = scoreRatio;
Optimizer.evalMetric = evalMetric;
Optimizer.normalizationOptions = normalizationOptions;
Optimizer.needScale = needScale;
//if need to use bleu stats history
if( iteration == 1 ) {
if(evalMetric.get_metricName().equals("BLEU") && usePseudoBleu) {
Optimizer.initBleuHistory(numSentences, evalMetric.get_suffStatsCount());
Optimizer.usePseudoBleu = usePseudoBleu;
Optimizer.R = R;
}
if(evalMetric.get_metricName().equals("TER-BLEU") && usePseudoBleu) {
Optimizer.initBleuHistory(numSentences, evalMetric.get_suffStatsCount()-2); //Stats count of TER=2
Optimizer.usePseudoBleu = usePseudoBleu;
Optimizer.R = R;
}
}
Vector<String> output = new Vector<String>();
//note: initialLambda[] has length = numParamsOld
//augmented with new feature weights, initial values are 0
double[] initialLambdaNew = new double[1 + numParams];
System.arraycopy(initialLambda, 1, initialLambdaNew, 1, numParamsOld);
//finalLambda[] has length = numParams (considering new features)
double[] finalLambda = new double[1 + numParams];
Optimizer opt = new Optimizer(output, isOptimizable, initialLambdaNew, feat_hash, stats_hash);
finalLambda = opt.runOptimizer();
if ( returnBest ) {
double metricScore = opt.getMetricScore();
if ( ! evalMetric.getToBeMinimized() ) {
if ( metricScore > prevMetricScore ) {
prevMetricScore = metricScore;
for ( int p = 1; p < bestLambda.size(); ++p )
bestLambda.set(p, finalLambda[p]);
if ( 1 + numParams > bestLambda.size() ) {
for ( int p = bestLambda.size(); p <= numParams; ++p )
bestLambda.add(p, finalLambda[p]);
}
}
} else {
if ( metricScore < prevMetricScore ) {
prevMetricScore = metricScore;
for ( int p = 1; p < bestLambda.size(); ++p )
bestLambda.set(p, finalLambda[p]);
if ( 1 + numParams > bestLambda.size() ) {
for ( int p = bestLambda.size(); p <= numParams; ++p )
bestLambda.add(p, finalLambda[p]);
}
}
}
}
// System.out.println(finalLambda.length);
// for( int i=0; i<finalLambda.length-1; i++ )
// System.out.println(finalLambda[i+1]);
/************* end optimization **************/
for (int i = 0; i < output.size(); i++)
println(output.get(i));
// check if any parameter has been updated
boolean anyParamChanged = false;
boolean anyParamChangedSignificantly = false;
for (int c = 1; c <= numParams; ++c) {
if (finalLambda[c] != lambda.get(c)) {
anyParamChanged = true;
}
if (Math.abs(finalLambda[c] - lambda.get(c)) > stopSigValue) {
anyParamChangedSignificantly = true;
}
}
// System.arraycopy(finalLambda,1,lambda,1,numParams);
println("--- MIRA iteration #" + iteration + " ending @ " + (new Date()) + " ---", 1);
println("", 1);
if (!anyParamChanged) {
println("No parameter value changed in this iteration; exiting MIRA.", 1);
println("", 1);
break; // exit for (iteration) loop preemptively
}
// was an early stopping criterion satisfied?
boolean critSatisfied = false;
if (!anyParamChangedSignificantly && stopSigValue >= 0) {
println("Note: No parameter value changed significantly " + "(i.e. by more than "
+ stopSigValue + ") in this iteration.", 1);
critSatisfied = true;
}
if (critSatisfied) {
++earlyStop;
println("", 1);
} else {
earlyStop = 0;
}
// if min number of iterations executed, investigate if early exit should happen
if (iteration >= minIts && earlyStop >= stopMinIts) {
println("Some early stopping criteria has been observed " + "in " + stopMinIts
+ " consecutive iterations; exiting MIRA.", 1);
println("", 1);
if ( returnBest ) {
for ( int f = 1; f <= numParams; ++f )
lambda.set(f, bestLambda.get(f));
} else {
for ( int f = 1; f <= numParams; ++f )
lambda.set(f, finalLambda[f]);
}
break; // exit for (iteration) loop preemptively
}
// if max number of iterations executed, exit
if (iteration >= maxIts) {
println("Maximum number of MIRA iterations reached; exiting MIRA.", 1);
println("", 1);
if ( returnBest ) {
for ( int f = 1; f <= numParams; ++f )
lambda.set(f, bestLambda.get(f));
} else {
for ( int f = 1; f <= numParams; ++f )
lambda.set(f, finalLambda[f]);
}
break; // exit for (iteration) loop
}
// use the new wt vector to decode the next iteration
// (interpolation with previous wt vector)
double interCoef = 1.0; //no interpolation for now
for (int i = 1; i <= numParams; i++)
lambda.set(i, interCoef * finalLambda[i] + (1 - interCoef) * lambda.get(i).doubleValue());
println("Next iteration will decode with lambda: "
+ lambdaToString(lambda), 1);
println("", 1);
// printMemoryUsage();
for (int i = 0; i < numSentences; ++i) {
suffStats_array[i].clear();
}
// cleanupMemory();
// println("",2);
retA[2] = 0; // i.e. this should NOT be the last iteration
done = true;
} // while (!done) // NOTE: this "loop" will only be carried out once
// delete .temp.stats.merged file, since it is not needed in the next
// iteration (it will be recreated from scratch)
deleteFile(tmpDirPrefix + "temp.stats.merged");
retA[0] = FINAL_score;
retA[1] = earlyStop;
return retA;
} // run_single_iteration
/**
 * Formats the leading entries of a weight vector for log output.
 *
 * <p>Weights are read starting at index 1 (index 0 is never printed), and at most the first 15
 * of the numParams weights are listed, each with four decimal places.
 *
 * @param lambdaA weight vector; must hold entries at indices 1..min(15, numParams)
 * @return a string of the form "{(listing the first K lambdas)v1, v2, ..., vK}"
 */
private String lambdaToString(ArrayList<Double> lambdaA) {
  // print at most the first 15 features
  int featToPrint = numParams > 15 ? 15 : numParams;

  StringBuilder retStr = new StringBuilder("{");
  retStr.append("(listing the first ").append(featToPrint).append(" lambdas)");

  // The last value printed is lambda[featToPrint], so the list really is "the first K" entries
  // (previously lambda[numParams] was printed in the last slot).
  for (int c = 1; c <= featToPrint; ++c) {
    if (c > 1) {
      retStr.append(", ");
    }
    retStr.append(String.format("%.4f", lambdaA.get(c).doubleValue()));
  }

  retStr.append("}");
  return retStr.toString();
}
/**
 * Obtains the n-best output for one iteration, either by reusing an existing "fake" decoder
 * output file or by launching the external decoder command and waiting for it to finish.
 *
 * <p>If the external decoder exits with a value other than validDecoderExitValue, or starting or
 * waiting for it fails, the problem is reported and the JVM is terminated.
 *
 * @param iteration current iteration number; used to build the fake file name and, when
 *        passIterationToDecoder is set, passed to the decoder command as its only argument
 * @return a two-element array: [0] is the name of the file to be processed, [1] indicates how
 *         that file was obtained ("1" = external decoder, "2" = fake decoder)
 */
private String[] run_decoder(int iteration) {
  // result[0]: name of the n-best file to be processed
  // result[1]: how the file was obtained
  //   1: external decoder
  //   2: fake decoder
  //   3: internal decoder (never produced by this method)
  String[] result = new String[2];

  boolean fakeOutputAvailable = fakeFileNameTemplate != null
      && fileExists(fakeFileNamePrefix + iteration + fakeFileNameSuffix);

  if (fakeOutputAvailable) {
    // use fake decoder
    String fakeFileName = fakeFileNamePrefix + iteration + fakeFileNameSuffix;
    println("Not running decoder; using " + fakeFileName + " instead.", 1);
    result[0] = fakeFileName;
    result[1] = "2";
    return result;
  }

  println("Running external decoder...", 1);
  try {
    ArrayList<String> command = new ArrayList<String>();
    command.add(decoderCommandFileName);
    if (passIterationToDecoder) {
      command.add(Integer.toString(iteration));
    }

    ProcessBuilder builder = new ProcessBuilder(command);
    // merge the error stream of the subprocess into its output stream
    builder.redirectErrorStream(true);
    Process decoderProcess = builder.start();

    // capture the sub-command's output
    new StreamGobbler(decoderProcess.getInputStream(), decVerbosity).start();

    int exitValue = decoderProcess.waitFor();
    if (exitValue != validDecoderExitValue) {
      println("Call to decoder returned " + exitValue + "; was expecting "
          + validDecoderExitValue + ".");
      System.exit(30);
    }
  } catch (IOException e) {
    System.err.println("IOException in MIRACore.run_decoder(int): " + e.getMessage());
    System.exit(99902);
  } catch (InterruptedException e) {
    System.err.println("InterruptedException in MIRACore.run_decoder(int): " + e.getMessage());
    System.exit(99903);
  }

  result[0] = decoderOutFileName;
  result[1] = "1";
  return result;
}
/**
 * Splits an n-best file into two per-iteration temp files: "temp.sents.it" + iteration (one
 * normalized candidate per line) and "temp.feats.it" + iteration (the matching feature string
 * per line). In both files a line "||||||" terminates the n-best list of each source sentence.
 *
 * <p>Each input line is expected to look like
 * "i ||| words of candidate translation . ||| feature values [||| anything else]".
 * Both output files are gzipped afterwards when compressFiles == 1. I/O failures are reported
 * and terminate the JVM.
 *
 * @param nbestFileName n-best file to read; read through gzip if the name ends in ".gz"
 * @param iteration iteration number used in the temp file names
 */
private void produceTempFiles(String nbestFileName, int iteration) {
  try {
    String sentsFileName = tmpDirPrefix + "temp.sents.it" + iteration;
    String featsFileName = tmpDirPrefix + "temp.feats.it" + iteration;

    BufferedWriter sentsOut = new BufferedWriter(
        new OutputStreamWriter(new FileOutputStream(sentsFileName, false), "utf8"));
    PrintWriter featsOut = new PrintWriter(featsFileName);

    InputStream nbestStream = nbestFileName.endsWith(".gz")
        ? new GZIPInputStream(new FileInputStream(nbestFileName))
        : new FileInputStream(nbestFileName);
    BufferedReader nbestIn = new BufferedReader(new InputStreamReader(nbestStream, "utf8"));

    int sentIndex = 0; // index of the source sentence whose list is currently being written
    int candsInList = 0; // candidates written so far for that sentence

    for (String line = nbestIn.readLine(); line != null; line = nbestIn.readLine()) {
      // in a well formed file, we'd find the next candidate for sentence sentIndex
      int firstSep = line.indexOf("|||");
      int read_i = Integer.parseInt(line.substring(0, firstSep).trim());
      if (read_i != sentIndex) {
        // the previous sentence had fewer than sizeOfNBest candidates: close its list
        writeLine("||||||", sentsOut);
        featsOut.println("||||||");
        candsInList = 0;
        ++sentIndex;
      }

      // drop the leading sentence index, then split candidate from features
      String rest = line.substring(firstSep + 3).trim();
      int secondSep = rest.indexOf("|||");
      String candidate = rest.substring(0, secondSep).trim();
      String feats = rest.substring(secondSep + 3).trim();

      // anything after a further "|||" is not part of the feature string
      int trailingSep = feats.indexOf("|||");
      if (trailingSep >= 0) {
        feats = feats.substring(0, trailingSep).trim();
      }

      writeLine(normalize(candidate, textNormMethod), sentsOut);
      featsOut.println(feats);
      ++candsInList;

      if (candsInList == sizeOfNBest) {
        // complete list: close it and move on to the next sentence
        writeLine("||||||", sentsOut);
        featsOut.println("||||||");
        candsInList = 0;
        ++sentIndex;
      }
    }

    if (sentIndex != numSentences) { // last sentence had too few candidates
      writeLine("||||||", sentsOut);
      featsOut.println("||||||");
    }

    nbestIn.close();
    sentsOut.close();
    featsOut.close();

    if (compressFiles == 1) {
      gzipFile(sentsFileName);
      gzipFile(featsFileName);
    }
  } catch (FileNotFoundException e) {
    System.err.println("FileNotFoundException in MIRACore.produceTempFiles(int): "
        + e.getMessage());
    System.exit(99901);
  } catch (IOException e) {
    System.err.println("IOException in MIRACore.produceTempFiles(int): " + e.getMessage());
    System.exit(99902);
  }
}
/**
 * Creates cfgFileName as a copy of templateFileName in which the feature weights are replaced by
 * the values in params.
 *
 * <p>A template line is treated as a weight line for feature c when it starts with
 * Vocabulary.word(c) followed by a space (c in 1..numParams); such a line is rewritten as
 * "name weight". All other lines are copied unchanged. Features that have no line in the
 * template are appended at the end. In both places a weight whose absolute value does not
 * exceed 1e-20 is left out of the output.
 *
 * <p>An IOException is reported and terminates the JVM.
 *
 * @param params weight vector, indexed from 1 to numParams
 * @param cfgFileName config file to create
 * @param templateFileName existing config file used as the template
 */
private void createConfigFile(ArrayList<Double> params, String cfgFileName, String templateFileName) {
  try {
    BufferedReader inFile = new BufferedReader(new FileReader(templateFileName));
    PrintWriter outFile = new PrintWriter(cfgFileName);

    // inTemplate[c] records that the template contains a weight line for feature c. Tracking
    // the ids explicitly (rather than counting matched lines and assuming they are features
    // 1..count) keeps the append step below correct when the template repeats a feature line
    // or lacks one for a low-numbered feature.
    boolean[] inTemplate = new boolean[1 + numParams];

    String line = inFile.readLine();
    while (line != null) {
      int c_match = -1;
      for (int c = 1; c <= numParams; ++c) {
        if (line.startsWith(Vocabulary.word(c) + " ")) {
          c_match = c;
          break;
        }
      }

      if (c_match == -1) {
        outFile.println(line);
      } else {
        inTemplate[c_match] = true;
        if (Math.abs(params.get(c_match).doubleValue()) > 1e-20) {
          outFile.println(Vocabulary.word(c_match) + " " + params.get(c_match));
        }
      }

      line = inFile.readLine();
    }

    // now append weights of features that the template does not mention
    for (int c = 1; c <= numParams; ++c) {
      if (!inTemplate[c] && Math.abs(params.get(c).doubleValue()) > 1e-20) {
        outFile.println(Vocabulary.word(c) + " " + params.get(c));
      }
    }

    inFile.close();
    outFile.close();
  } catch (IOException e) {
    System.err.println("IOException in MIRACore.createConfigFile(double[],String,String): "
        + e.getMessage());
    System.exit(99902);
  }
}
/**
 * Reads the parameter file named by paramsFileName and fills in lambda, defaultLambda,
 * isOptimizable, minRandValue, maxRandValue and normalizationOptions.
 *
 * <p>For each of the numParams parameters the file must provide, in order: the parameter name
 * (skipped), the token "|||", the default value, "Opt" or "Fix", and two further tokens giving
 * the range for random values (parsed only for "Opt" parameters). The first non-empty line after
 * the parameters is parsed as the normalization setting, e.g. "normalization = absval 1 lm".
 *
 * <p>The validation failures checked below are reported and end the JVM via System.exit(21); a
 * missing parameter file ends it via System.exit(99901).
 */
private void processParamFile() {
  // process parameter file
  Scanner inFile_init = null;
  try {
    inFile_init = new Scanner(new FileReader(paramsFileName));
  } catch (FileNotFoundException e) {
    System.err.println("FileNotFoundException in MIRACore.processParamFile(): " + e.getMessage());
    System.exit(99901);
  }

  String dummy = "";

  // initialize lambda[] and other related arrays
  for (int c = 1; c <= numParams; ++c) {
    // skip parameter name (it may consist of several tokens; stop at the "|||" separator)
    while (!dummy.equals("|||")) {
      dummy = inFile_init.next();
    }

    // read default value
    lambda.set(c, inFile_init.nextDouble());
    defaultLambda[c] = lambda.get(c).doubleValue();

    // read isOptimizable
    dummy = inFile_init.next();
    if (dummy.equals("Opt")) {
      isOptimizable[c] = true;
    } else if (dummy.equals("Fix")) {
      isOptimizable[c] = false;
    } else {
      println("Unknown isOptimizable string " + dummy + " (must be either Opt or Fix)");
      System.exit(21);
    }

    if (!isOptimizable[c]) { // skip next two values
      dummy = inFile_init.next();
      dummy = inFile_init.next();
    } else {
      // set minRandValue[c] and maxRandValue[c] (range for random values)
      dummy = inFile_init.next();
      if (dummy.equals("-Inf") || dummy.equals("+Inf")) {
        println("minRandValue[" + c + "] cannot be -Inf or +Inf!");
        System.exit(21);
      } else {
        minRandValue[c] = Double.parseDouble(dummy);
      }

      dummy = inFile_init.next();
      if (dummy.equals("-Inf") || dummy.equals("+Inf")) {
        println("maxRandValue[" + c + "] cannot be -Inf or +Inf!");
        System.exit(21);
      } else {
        maxRandValue[c] = Double.parseDouble(dummy);
      }

      // check for illogical values
      if (minRandValue[c] > maxRandValue[c]) {
        println("minRandValue[" + c + "]=" + minRandValue[c] + " > " + maxRandValue[c]
            + "=maxRandValue[" + c + "]!");
        System.exit(21);
      }

      // check for odd values
      if (minRandValue[c] == maxRandValue[c]) {
        println("Warning: lambda[" + c + "] has " + "minRandValue = maxRandValue = "
            + minRandValue[c] + ".", 1);
      }
    } // if (!isOptimizable[c])
  }

  // set normalizationOptions[]: find the first non-empty line after the parameters.
  // Scanner.nextLine() throws instead of returning null at end of input, so the end of the
  // file is detected with hasNextLine() and reported like the other format errors.
  String origLine = "";
  while (origLine.length() == 0 && inFile_init.hasNextLine()) {
    origLine = inFile_init.nextLine();
  }
  if (origLine.length() == 0) {
    println("Missing normalization line in parameter file " + paramsFileName + ".");
    System.exit(21);
  }

  // How should a lambda[] vector be normalized (before decoding)?
  // nO[0] = 0: no normalization
  // nO[0] = 1: scale so that parameter nO[2] has absolute value nO[1]
  // nO[0] = 2: scale so that the maximum absolute value is nO[1]
  // nO[0] = 3: scale so that the minimum absolute value is nO[1]
  // nO[0] = 4: scale so that the L-nO[1] norm equals nO[2]

  // normalization = none
  // normalization = absval 1 lm
  // normalization = maxabsval 1
  // normalization = minabsval 1
  // normalization = LNorm 2 1

  dummy = (origLine.substring(origLine.indexOf("=") + 1)).trim();
  String[] dummyA = dummy.split("\\s+");

  if (dummyA[0].equals("none")) {
    normalizationOptions[0] = 0;
  } else if (dummyA[0].equals("absval")) {
    normalizationOptions[0] = 1;
    normalizationOptions[1] = Double.parseDouble(dummyA[1]);
    String pName = dummyA[2];
    for (int i = 3; i < dummyA.length; ++i) { // in case parameter name has multiple words
      pName = pName + " " + dummyA[i];
    }
    normalizationOptions[2] = Vocabulary.id(pName);

    if (normalizationOptions[1] <= 0) {
      println("Value for the absval normalization method must be positive.");
      System.exit(21);
    }
    if (normalizationOptions[2] == 0) {
      // report the name that was not recognized (the id itself is always 0 here)
      println("Unrecognized feature name " + pName
          + " for absval normalization method.", 1);
      System.exit(21);
    }
  } else if (dummyA[0].equals("maxabsval")) {
    normalizationOptions[0] = 2;
    normalizationOptions[1] = Double.parseDouble(dummyA[1]);
    if (normalizationOptions[1] <= 0) {
      println("Value for the maxabsval normalization method must be positive.");
      System.exit(21);
    }
  } else if (dummyA[0].equals("minabsval")) {
    normalizationOptions[0] = 3;
    normalizationOptions[1] = Double.parseDouble(dummyA[1]);
    if (normalizationOptions[1] <= 0) {
      println("Value for the minabsval normalization method must be positive.");
      System.exit(21);
    }
  } else if (dummyA[0].equals("LNorm")) {
    normalizationOptions[0] = 4;
    normalizationOptions[1] = Double.parseDouble(dummyA[1]);
    normalizationOptions[2] = Double.parseDouble(dummyA[2]);
    if (normalizationOptions[1] <= 0 || normalizationOptions[2] <= 0) {
      println("Both values for the LNorm normalization method must be positive.");
      System.exit(21);
    }
  } else {
    println("Unrecognized normalization method " + dummyA[0] + "; "
        + "must be one of none, absval, maxabsval, minabsval, and LNorm.");
    System.exit(21);
  } // if (dummyA[0])

  inFile_init.close();
} // processParamFile()
/**
 * Sets numDocuments and docOfSentence[] (the 0-based document index of every sentence).
 *
 * <p>Without a document info file all sentences are assigned to a single document. Otherwise
 * the file named by docInfoFileName is read in one of four formats:
 * <ol>
 * <li>a list of numbers, one per document, giving the number of sentences in each document;</li>
 * <li>a list of "docName size" pairs, one per document;</li>
 * <li>a list of document names, one per sentence, naming the document of each sentence;</li>
 * <li>a list of docName_number entries, one per sentence, naming the document of each sentence
 * and its position in that document ('-' may be used instead of '_').</li>
 * </ol>
 * Formats 1/2 are assumed when the file has fewer non-empty lines than there are sentences,
 * formats 3/4 when it has exactly as many. A file with more lines is left unprocessed.
 * I/O failures are reported and terminate the JVM.
 */
private void processDocInfo() {
  docOfSentence = new int[numSentences];

  if (docInfoFileName == null) {
    // no document information: everything belongs to document 0
    for (int s = 0; s < numSentences; ++s) {
      docOfSentence[s] = 0;
    }
    numDocuments = 1;
    return;
  }

  try {
    int docInfoSize = countNonEmptyLines(docInfoFileName);

    if (docInfoSize < numSentences) { // format #1 or #2
      numDocuments = docInfoSize;
      int sentIndex = 0;

      BufferedReader reader = new BufferedReader(new FileReader(docInfoFileName));
      String line = reader.readLine();
      // format #1 lines hold just a number; format #2 lines contain a space
      boolean sizesOnly = !line.contains(" ");

      for (int doc = 0; doc < numDocuments; ++doc) {
        if (doc > 0) {
          line = reader.readLine();
        }

        int docSize;
        if (sizesOnly) {
          docSize = Integer.parseInt(line);
        } else {
          docSize = Integer.parseInt(line.split("\\s+")[1]);
        }

        // the next docSize sentences belong to this document
        for (int k = 0; k < docSize; ++k) {
          docOfSentence[sentIndex] = doc;
          ++sentIndex;
        }
      }
      // now sentIndex == numSentences

      reader.close();
    } else if (docInfoSize == numSentences) { // format #3 or #4
      // first pass: a repeated line means the lines are plain document names (format #3)
      boolean namesOnly = false;
      HashSet<String> linesSeen = new HashSet<String>();
      BufferedReader reader = new BufferedReader(new FileReader(docInfoFileName));
      for (int s = 0; s < numSentences; ++s) {
        String line = reader.readLine();
        if (!linesSeen.add(line)) {
          namesOnly = true;
        }
      }
      reader.close();

      // second pass: number the documents in the order in which they are first seen
      HashSet<String> seenDocNames = new HashSet<String>();
      HashMap<String, Integer> docOrder = new HashMap<String, Integer>();
      reader = new BufferedReader(new FileReader(docInfoFileName));
      for (int s = 0; s < numSentences; ++s) {
        String line = reader.readLine();

        String docName;
        if (namesOnly) {
          docName = line;
        } else {
          // strip the trailing "_number" / "-number" suffix
          int sepPos = Math.max(line.lastIndexOf('_'), line.lastIndexOf('-'));
          docName = line.substring(0, sepPos);
        }

        if (seenDocNames.add(docName)) {
          docOrder.put(docName, seenDocNames.size() - 1);
        }

        int docIndex = docOrder.get(docName);
        docOfSentence[s] = docIndex;
      }
      reader.close();

      numDocuments = seenDocNames.size();
    } else {
      // badly formatted: nothing is done
    }
  } catch (FileNotFoundException e) {
    System.err.println("FileNotFoundException in MIRACore.processDocInfo(): " + e.getMessage());
    System.exit(99901);
  } catch (IOException e) {
    System.err.println("IOException in MIRACore.processDocInfo(): " + e.getMessage());
    System.exit(99902);
  }
}
/**
 * Copies the bytes of one file to another, overwriting the target if it exists.
 *
 * <p>Failures are reported on System.err and signalled through the return value; no exception
 * is propagated. Both streams are released on every path, including failures part-way through.
 *
 * @param origFileName file to read
 * @param newFileName file to create or overwrite
 * @return true if the copy completed, false if either file could not be opened or an I/O error
 *         occurred
 */
private boolean copyFile(String origFileName, String newFileName) {
  InputStream in = null;
  OutputStream out = null;
  try {
    in = new FileInputStream(new File(origFileName));
    out = new FileOutputStream(new File(newFileName));

    byte[] buffer = new byte[1024];
    int len;
    while ((len = in.read(buffer)) > 0) {
      out.write(buffer, 0, len);
    }

    // close explicitly on the success path so that a failing close is reported as a failure
    in.close();
    out.close();
    return true;
  } catch (FileNotFoundException e) {
    System.err.println("FileNotFoundException in MIRACore.copyFile(String,String): "
        + e.getMessage());
    return false;
  } catch (IOException e) {
    System.err.println("IOException in MIRACore.copyFile(String,String): " + e.getMessage());
    return false;
  } finally {
    // release whatever is still open; closing an already closed stream has no effect
    if (in != null) {
      try {
        in.close();
      } catch (IOException ignored) {
        // the outcome has already been decided above
      }
    }
    if (out != null) {
      try {
        out.close();
      } catch (IOException ignored) {
        // the outcome has already been decided above
      }
    }
  }
}
/**
 * Renames origFileName to newFileName, deleting any existing file of the new name first.
 * A warning is printed if the original file does not exist or the rename fails.
 *
 * @param origFileName current name of the file
 * @param newFileName name the file should have afterwards
 */
private void renameFile(String origFileName, String newFileName) {
  if (!fileExists(origFileName)) {
    println("Warning: file " + origFileName + " does not exist! (in MIRACore.renameFile)", 1);
    return;
  }

  // make room for the new name
  deleteFile(newFileName);

  File source = new File(origFileName);
  File target = new File(newFileName);
  boolean renamed = source.renameTo(target);
  if (!renamed) {
    println("Warning: attempt to rename " + origFileName + " to " + newFileName
        + " was unsuccessful!", 1);
  }
}
/**
 * Deletes the named file if it exists; prints a warning if the deletion fails. Nothing happens
 * when the file does not exist.
 *
 * @param fileName name of the file to delete
 */
private void deleteFile(String fileName) {
  if (!fileExists(fileName)) {
    return;
  }

  boolean deleted = new File(fileName).delete();
  if (!deleted) {
    println("Warning: attempt to delete " + fileName + " was unsuccessful!", 1);
  }
}
/**
 * Writes one line of text followed by the platform line separator and flushes the writer.
 *
 * @param line text to write
 * @param writer destination
 * @throws IOException if writing or flushing fails
 */
private void writeLine(String line, BufferedWriter writer) throws IOException {
  writer.write(line);
  writer.newLine();
  writer.flush();
}
// need to re-write to handle different forms of lambda
/**
 * Wraps up a run: cleans up the internal decoder if there is one, writes the final weights to
 * decoderConfigFileName + ".MIRA.final", removes the working config file and the decoder
 * output, restores the original config file from its ".MIRA.orig" copy, and, if
 * finalLambdaFileName is set, writes one "name ||| value" line per parameter to that file.
 */
public void finish() {
  if (myDecoder != null) {
    myDecoder.cleanUp();
  }

  String origConfigName = decoderConfigFileName + ".MIRA.orig";

  // create config file with final values, using the saved original as the template
  createConfigFile(lambda, decoderConfigFileName + ".MIRA.final", origConfigName);

  // delete current decoder config file and decoder output
  deleteFile(decoderConfigFileName);
  deleteFile(decoderOutFileName);

  // restore original name for config file (name was changed
  // in initialize() so it doesn't get overwritten)
  renameFile(origConfigName, decoderConfigFileName);

  if (finalLambdaFileName == null) {
    return;
  }

  try {
    PrintWriter lambdaOut = new PrintWriter(finalLambdaFileName);
    for (int c = 1; c <= numParams; ++c) {
      lambdaOut.println(Vocabulary.word(c) + " ||| " + lambda.get(c).doubleValue());
    }
    lambdaOut.close();
  } catch (IOException e) {
    System.err.println("IOException in MIRACore.finish(): " + e.getMessage());
    System.exit(99902);
  }
}
/**
 * Reads a MIRA configuration file and flattens it into a command-line style argument array.
 *
 * <p>Everything from the first {@code '#'} on a line is treated as a comment. Lines that are
 * empty once the comment and surrounding whitespace are removed are skipped (this includes
 * whitespace-only lines and indented comment lines). Every remaining line must be either
 * {@code -option value}, or a {@code -m} / {@code -docSet} line with more than one value;
 * single-quoted text forms one token. Any other line is reported as malformed and the JVM exits
 * with status 70.
 *
 * @param fileName path of the configuration file; the file must exist (see checkFile)
 * @return the tokens of all accepted lines, in file order
 */
private String[] cfgFileToArgsArray(String fileName) {
  checkFile(fileName);
  ArrayList<String> collectedArgs = new ArrayList<String>();
  BufferedReader inFile = null;
  try {
    inFile = new BufferedReader(new FileReader(fileName));
    String origLine; // kept untouched for error reporting purposes
    while ((origLine = inFile.readLine()) != null) {
      int commentStart = origLine.indexOf('#');
      String line = (commentStart >= 0) ? origLine.substring(0, commentStart) : origLine;
      line = line.trim();
      if (line.length() == 0) {
        continue; // blank line or comment-only line
      }
      // now line should look like "-xxx XXX"
      ArrayList<String> paramA = splitCfgLine(line);
      if (paramA.size() == 2 && paramA.get(0).startsWith("-")) {
        collectedArgs.addAll(paramA);
      } else if (paramA.size() > 2
          && (paramA.get(0).equals("-m") || paramA.get(0).equals("-docSet"))) {
        // -m (metricName), -docSet are allowed to have extra options
        collectedArgs.addAll(paramA);
      } else {
        println("Malformed line in config file:");
        println(origLine);
        System.exit(70);
      }
    }
  } catch (FileNotFoundException e) {
    println("MIRA configuration file " + fileName + " was not found!");
    System.err.println("FileNotFoundException in MIRACore.cfgFileToArgsArray(String): "
        + e.getMessage());
    System.exit(99901);
  } catch (IOException e) {
    System.err.println("IOException in MIRACore.cfgFileToArgsArray(String): " + e.getMessage());
    System.exit(99902);
  } finally {
    if (inFile != null) {
      try {
        inFile.close();
      } catch (IOException ignored) {
        // everything needed has already been read; a failed close of a reader loses nothing
      }
    }
  }
  return collectedArgs.toArray(new String[collectedArgs.size()]);
}

/**
 * Splits one config line into tokens (cmu modification, from meteor for zmert). Tokens are
 * separated by whitespace; text between single quotes is kept together as one token, including
 * any whitespace inside it, and the quotes themselves are dropped. A closing quote always ends
 * the current token, even if that token is empty.
 */
private ArrayList<String> splitCfgLine(String line) {
  ArrayList<String> tokens = new ArrayList<String>();
  StringBuilder current = new StringBuilder();
  boolean quoted = false;
  for (int i = 0; i < line.length(); i++) {
    char ch = line.charAt(i);
    if (Character.isWhitespace(ch)) {
      if (quoted) {
        current.append(ch);
      } else if (current.length() > 0) {
        tokens.add(current.toString());
        current = new StringBuilder();
      }
    } else if (ch == '\'') {
      if (quoted) {
        tokens.add(current.toString());
        current = new StringBuilder();
      }
      quoted = !quoted;
    } else {
      current.append(ch);
    }
  }
  if (current.length() > 0) {
    tokens.add(current.toString());
  }
  return tokens;
}
/**
 * Processes the argument array as a first-time invocation, i.e. delegates to
 * {@code processArgsArray(args, true)}.
 *
 * @param args option/value tokens
 */
private void processArgsArray(String[] args) {
  final boolean firstTime = true;
  processArgsArray(args, firstTime);
}
/**
 * Resets all option fields handled here to their defaults, then applies the options found in
 * {@code args}, resolves file names against {@code -dir} (if given), and verifies that the
 * required files exist and that some way of decoding is available.
 *
 * <p>Invalid option values and unknown options print a message and terminate the JVM with
 * status 10; missing files and decoding problems use other exit codes (see the individual
 * checks below).
 *
 * @param args option/value tokens, e.g. as produced from a config file
 * @param firstTime when true, non-fatal warnings and a final echo of {@code args} are printed
 *        (at priority 1); when false they are suppressed
 */
private void processArgsArray(String[] args, boolean firstTime) {
  resetArgDefaults();

  int i = 0;
  while (i < args.length) {
    String option = args[i];
    // Relevant files
    if (option.equals("-dir")) {
      dirPrefix = args[i + 1];
    } else if (option.equals("-s")) {
      sourceFileName = args[i + 1];
    } else if (option.equals("-r")) {
      refFileName = args[i + 1];
    } else if (option.equals("-rps")) {
      refsPerSen = parseBoundedIntArg(args[i + 1], 1, Integer.MAX_VALUE,
          "refsPerSen must be positive.");
    } else if (option.equals("-txtNrm")) {
      textNormMethod = parseBoundedIntArg(args[i + 1], 0, 4,
          "textNormMethod should be between 0 and 4");
    } else if (option.equals("-p")) {
      paramsFileName = args[i + 1];
    } else if (option.equals("-docInfo")) {
      docInfoFileName = args[i + 1];
    } else if (option.equals("-fin")) {
      finalLambdaFileName = args[i + 1];
    }
    // MERT specs
    else if (option.equals("-m")) {
      metricName = args[i + 1];
      metricName_display = metricName;
      if (EvaluationMetric.knownMetricName(metricName)) {
        int optionCount = EvaluationMetric.metricOptionCount(metricName);
        metricOptions = new String[optionCount];
        for (int opt = 0; opt < optionCount; ++opt) {
          metricOptions[opt] = args[i + opt + 2];
        }
        i += optionCount; // skip the metric's own options
      } else {
        println("Unknown metric name " + metricName + ".");
        System.exit(10);
      }
    } else if (option.equals("-docSet")) {
      i += parseDocSetArgs(args, i); // skip the extra -docSet tokens
    } else if (option.equals("-maxIt")) {
      maxMERTIterations = parseBoundedIntArg(args[i + 1], 1, Integer.MAX_VALUE,
          "maxIt must be positive.");
    } else if (option.equals("-minIt")) {
      minMERTIterations = parseBoundedIntArg(args[i + 1], 1, Integer.MAX_VALUE,
          "minIt must be positive.");
    } else if (option.equals("-prevIt")) {
      prevMERTIterations = parseBoundedIntArg(args[i + 1], 0, Integer.MAX_VALUE,
          "prevIt must be non-negative.");
    } else if (option.equals("-stopIt")) {
      stopMinIts = parseBoundedIntArg(args[i + 1], 1, Integer.MAX_VALUE,
          "stopIts must be positive.");
    } else if (option.equals("-stopSig")) {
      stopSigValue = Double.parseDouble(args[i + 1]);
    }
    //
    // /* possibly other early stopping criteria here */
    //
    else if (option.equals("-thrCnt")) {
      numOptThreads = parseBoundedIntArg(args[i + 1], 1, Integer.MAX_VALUE,
          "threadCount must be positive.");
    } else if (option.equals("-save")) {
      saveInterFiles = parseBoundedIntArg(args[i + 1], 0, 3, "save should be between 0 and 3");
    } else if (option.equals("-compress")) {
      compressFiles = parseBoundedIntArg(args[i + 1], 0, 1,
          "compressFiles should be either 0 or 1");
    } else if (option.equals("-opi")) {
      oneModificationPerIteration = parseBinaryArg(args[i + 1],
          "oncePerIt must be either 0 or 1.");
    } else if (option.equals("-rand")) {
      randInit = parseBinaryArg(args[i + 1], "randInit must be either 0 or 1.");
    } else if (option.equals("-seed")) {
      if (args[i + 1].equals("time")) {
        seed = System.currentTimeMillis();
      } else {
        seed = Long.parseLong(args[i + 1]);
      }
    }
    // (the -ud / useDisk option is currently disabled)
    // for mira:
    else if (option.equals("-needShuffle")) {
      needShuffle = parseBinaryArg(args[i + 1], "-needShuffle must be either 0 or 1.");
    }
    // average weights after each epoch or not
    else if (option.equals("-needAvg")) {
      needAvg = parseBinaryArg(args[i + 1], "-needAvg must be either 0 or 1.");
    }
    // return the best weight during tuning or not
    else if (option.equals("-returnBest")) {
      returnBest = parseBinaryArg(args[i + 1], "-returnBest must be either 0 or 1.");
    }
    // run perceptron or not
    else if (option.equals("-runPercep")) {
      runPercep = parseBinaryArg(args[i + 1], "-runPercep must be either 0 or 1.");
    }
    // oracle selection mode
    else if (option.equals("-oracleSelection")) {
      oraSelectMode = Integer.parseInt(args[i + 1]);
    }
    // prediction selection mode
    else if (option.equals("-predictionSelection")) {
      predSelectMode = Integer.parseInt(args[i + 1]);
    }
    // MIRA internal iterations
    else if (option.equals("-miraIter")) {
      miraIter = Integer.parseInt(args[i + 1]);
    }
    // relaxation coefficient
    else if (option.equals("-C")) {
      C = Double.parseDouble(args[i + 1]);
    }
    // (the -sentForScaling option is currently disabled)
    else if (option.equals("-scoreRatio")) {
      scoreRatio = Double.parseDouble(args[i + 1]);
      if (scoreRatio <= 0) {
        println("-scoreRatio must be positive");
        System.exit(10);
      }
    } else if (option.equals("-needScaling")) {
      needScale = parseBinaryArg(args[i + 1], "-needScaling must be either 0 or 1.");
    } else if (option.equals("-usePseudoCorpus")) {
      usePseudoBleu = parseBinaryArg(args[i + 1], "-usePseudoCorpus must be either 0 or 1.");
    } else if (option.equals("-corpusDecay")) {
      R = Double.parseDouble(args[i + 1]);
    }
    // Decoder specs
    else if (option.equals("-cmd")) {
      decoderCommandFileName = args[i + 1];
    } else if (option.equals("-passIt")) {
      passIterationToDecoder = parseBinaryArg(args[i + 1],
          "passIterationToDecoder should be either 0 or 1");
    } else if (option.equals("-decOut")) {
      decoderOutFileName = args[i + 1];
    } else if (option.equals("-decExit")) {
      validDecoderExitValue = Integer.parseInt(args[i + 1]);
    } else if (option.equals("-dcfg")) {
      decoderConfigFileName = args[i + 1];
    } else if (option.equals("-N")) {
      sizeOfNBest = parseBoundedIntArg(args[i + 1], 1, Integer.MAX_VALUE, "N must be positive.");
    }
    // Output specs
    else if (option.equals("-v")) {
      verbosity = parseBoundedIntArg(args[i + 1], 0, 4, "verbosity should be between 0 and 4");
    } else if (option.equals("-decV")) {
      decVerbosity = parseBoundedIntArg(args[i + 1], 0, 1,
          "decVerbosity should be either 0 or 1");
    } else if (option.equals("-fake")) {
      fakeFileNameTemplate = args[i + 1];
      int QM_i = fakeFileNameTemplate.indexOf("?");
      if (QM_i <= 0) {
        println("fakeFileNameTemplate must contain '?' to indicate position of iteration number");
        System.exit(10);
      }
      fakeFileNamePrefix = fakeFileNameTemplate.substring(0, QM_i);
      fakeFileNameSuffix = fakeFileNameTemplate.substring(QM_i + 1);
    } else {
      println("Unknown option " + option);
      System.exit(10);
    }
    i += 2; // the option itself plus its first value
  } // while (i)

  if (maxMERTIterations < minMERTIterations) {
    if (firstTime) {
      println("Warning: maxMERTIts is smaller than minMERTIts; " + "decreasing minMERTIts from "
          + minMERTIterations + " to maxMERTIts " + "(i.e. " + maxMERTIterations + ").", 1);
    }
    minMERTIterations = maxMERTIterations;
  }

  if (dirPrefix != null) { // append dirPrefix to file names
    refFileName = fullPath(dirPrefix, refFileName);
    decoderOutFileName = fullPath(dirPrefix, decoderOutFileName);
    paramsFileName = fullPath(dirPrefix, paramsFileName);
    decoderConfigFileName = fullPath(dirPrefix, decoderConfigFileName);
    if (sourceFileName != null) {
      sourceFileName = fullPath(dirPrefix, sourceFileName);
    }
    if (docInfoFileName != null) {
      docInfoFileName = fullPath(dirPrefix, docInfoFileName);
    }
    if (finalLambdaFileName != null) {
      finalLambdaFileName = fullPath(dirPrefix, finalLambdaFileName);
    }
    if (decoderCommandFileName != null) {
      decoderCommandFileName = fullPath(dirPrefix, decoderCommandFileName);
    }
    if (fakeFileNamePrefix != null) {
      fakeFileNamePrefix = fullPath(dirPrefix, fakeFileNamePrefix);
    }
  }

  // TODO: make this an argument
  // TODO: also use this for the state file? could be tricky, since that file is created by
  // ZMERT.java
  // TODO: change name from tmpDirPrefix to tmpFilePrefix?
  // temporary files live next to the decoder output and share the "MIRA." prefix
  int k = decoderOutFileName.lastIndexOf("/");
  if (k >= 0) {
    tmpDirPrefix = decoderOutFileName.substring(0, k + 1) + "MIRA.";
  } else {
    tmpDirPrefix = "MIRA.";
  }
  println("tmpDirPrefix: " + tmpDirPrefix);

  checkFile(paramsFileName);
  checkFile(decoderConfigFileName);

  boolean canRunCommand = fileExists(decoderCommandFileName);
  if (decoderCommandFileName != null && !canRunCommand) {
    // i.e. a decoder command file was specified, but it was not found
    if (firstTime) {
      println("Warning: specified decoder command file " + decoderCommandFileName
          + " was not found.", 1);
    }
  }
  boolean canRunJoshua = fileExists(sourceFileName);
  if (sourceFileName != null && !canRunJoshua) {
    // i.e. a source file was specified, but it was not found
    if (firstTime) {
      println("Warning: specified source file " + sourceFileName + " was not found.", 1);
    }
  }
  if (!canRunCommand && !canRunJoshua) { // can only run fake decoder
    checkFakeDecoderFiles(firstTime);
  }

  if (refsPerSen > 1) {
    // the provided refFileName might be a prefix
    File dummy = new File(refFileName);
    if (!dummy.exists()) {
      refFileName = createUnifiedRefFile(refFileName, refsPerSen);
    }
  } else {
    checkFile(refFileName);
  }

  if (firstTime) {
    println("Processed the following args array:", 1);
    print(" ", 1);
    for (String arg : args) {
      print(arg + " ", 1);
    }
    println("", 1);
    println("", 1);
  }
} // processArgs(String[] args)

/** Restores the default value of every option field that processArgsArray resets. */
private void resetArgDefaults() {
  // Relevant files
  dirPrefix = null;
  sourceFileName = null;
  refFileName = "reference.txt";
  refsPerSen = 1;
  textNormMethod = 1;
  paramsFileName = "params.txt";
  docInfoFileName = null;
  finalLambdaFileName = null;
  // MERT specs
  metricName = "BLEU";
  metricName_display = metricName;
  metricOptions = new String[2];
  metricOptions[0] = "4";
  metricOptions[1] = "closest";
  docSubsetInfo = new int[7];
  docSubsetInfo[0] = 0;
  maxMERTIterations = 20;
  prevMERTIterations = 20;
  minMERTIterations = 5;
  stopMinIts = 3;
  stopSigValue = -1;
  //
  // /* possibly other early stopping criteria here */
  //
  numOptThreads = 1;
  saveInterFiles = 3;
  compressFiles = 0;
  oneModificationPerIteration = false;
  randInit = false;
  seed = System.currentTimeMillis();
  // Decoder specs
  decoderCommandFileName = null;
  passIterationToDecoder = false;
  decoderOutFileName = "output.nbest";
  validDecoderExitValue = 0;
  decoderConfigFileName = "dec_cfg.txt";
  sizeOfNBest = 100;
  fakeFileNameTemplate = null;
  fakeFileNamePrefix = null;
  fakeFileNameSuffix = null;
  // Output specs
  verbosity = 1;
  decVerbosity = 0;
}

/**
 * Parses {@code value} as an int and requires it to lie in {@code [min, max]}; otherwise prints
 * {@code complaint} and terminates the JVM with status 10.
 */
private int parseBoundedIntArg(String value, int min, int max, String complaint) {
  int parsed = Integer.parseInt(value);
  if (parsed < min || parsed > max) {
    println(complaint);
    System.exit(10);
  }
  return parsed;
}

/**
 * Parses {@code value} as a 0/1 switch and returns true for 1, false for 0; any other integer
 * prints {@code complaint} and terminates the JVM with status 10.
 */
private boolean parseBinaryArg(String value, String complaint) {
  int parsed = Integer.parseInt(value);
  if (parsed != 0 && parsed != 1) {
    println(complaint);
    System.exit(10);
  }
  return parsed == 1;
}

/**
 * Interprets the tokens following a {@code -docSet} option located at {@code args[i]} and
 * records the result in {@code docSubsetInfo} ([0] = method code, [5] and [6] = numeric
 * arguments). An unknown method prints a message and terminates the JVM with status 10.
 *
 * @return how many tokens the option consumes beyond the usual option/value pair
 */
private int parseDocSetArgs(String[] args, int i) {
  String method = args[i + 1];
  if (method.equals("all")) {
    docSubsetInfo[0] = 0;
    return 0;
  }
  if (method.equals("bottom") || method.equals("top")) {
    boolean bottom = method.equals("bottom");
    String a = args[i + 2];
    if (a.endsWith("d")) { // a document count, e.g. "8d"
      docSubsetInfo[0] = bottom ? 1 : 3;
      a = a.substring(0, a.indexOf("d"));
    } else { // otherwise taken to be a percentage, e.g. "25%"
      docSubsetInfo[0] = bottom ? 2 : 4;
      a = a.substring(0, a.indexOf("%"));
    }
    docSubsetInfo[5] = Integer.parseInt(a);
    return 1;
  }
  if (method.equals("window")) {
    // args[i + 3] is skipped; only the window size and the center are read
    String a1 = args[i + 2];
    a1 = a1.substring(0, a1.indexOf("d")); // size of window
    String a2 = args[i + 4];
    if (a2.indexOf("p") > 0) { // center given with a 'p' suffix
      docSubsetInfo[0] = 5;
      a2 = a2.substring(0, a2.indexOf("p"));
    } else { // center given with an 'r' suffix
      docSubsetInfo[0] = 6;
      a2 = a2.substring(0, a2.indexOf("r"));
    }
    docSubsetInfo[5] = Integer.parseInt(a1);
    docSubsetInfo[6] = Integer.parseInt(a2);
    return 3;
  }
  println("Unknown docSet method " + method + ".");
  System.exit(10);
  return 0;
}

/**
 * Called when neither a decoder command file nor a source file is available, so only the fake
 * decoder can be used. Exits with status 12 if no fake file template was given, and with status
 * 13 if the output file for iteration 1 is missing; warns (first time only) if output files
 * exist for fewer than {@code maxMERTIterations} iterations.
 */
private void checkFakeDecoderFiles(boolean firstTime) {
  boolean canRunFake = (fakeFileNameTemplate != null);
  if (!canRunFake) {
    println("MIRA cannot decode; must provide one of: command file (for external decoder),");
    println(" source file (for Joshua decoder),");
    println(" or prefix for existing output files (for fake decoder).");
    System.exit(12);
  }
  // find the last iteration of the unbroken run 1, 2, 3, ... that has an output file
  int lastGoodIt = 0;
  for (int it = 1; it <= maxMERTIterations; ++it) {
    if (fileExists(fakeFileNamePrefix + it + fakeFileNameSuffix)) {
      lastGoodIt = it;
    } else {
      break; // from for (it) loop
    }
  }
  if (lastGoodIt == 0) {
    println("Fake decoder cannot find first output file "
        + (fakeFileNamePrefix + 1 + fakeFileNameSuffix));
    System.exit(13);
  } else if (lastGoodIt < maxMERTIterations) {
    if (firstTime) {
      println("Warning: can only run fake decoder; existing output files "
          + "are only available for the first " + lastGoodIt + " iteration(s).", 1);
    }
  }
}
/**
 * Fills in the derived entries of a document-subset descriptor, based on the method code in
 * {@code info[0]} and the current {@code numDocuments}.
 *
 * <p>Layout of {@code info}: [0] method (0-6), [1] first document (1-indexed), [2] last
 * document (1-indexed), [3] subset size, [4] center, [5] arg1, [6] arg2.
 *
 * <ul>
 * <li>0: all documents</li>
 * <li>1: bottom arg1 documents; 2: bottom ceil(arg1% of numDocuments) documents</li>
 * <li>3: top arg1 documents; 4: top ceil(arg1% of numDocuments) documents</li>
 * <li>5: window of arg1 documents centered on floor(arg2% of numDocuments)</li>
 * <li>6: window of arg1 documents centered on document arg2</li>
 * </ul>
 * No range checking is done here; any other method code leaves {@code info} untouched.
 *
 * @param info descriptor to complete in place
 */
private void set_docSubsetInfo(int[] info) {
  final int method = info[0];
  switch (method) {
    case 0: // all
      info[1] = 1;
      info[2] = numDocuments;
      info[3] = numDocuments;
      info[4] = (info[1] + info[2]) / 2;
      break;
    case 1: // bottom d
    case 2: // bottom p
      if (method == 1) {
        info[3] = info[5];
      } else {
        info[3] = (int) (Math.ceil((info[5] / 100.0) * numDocuments));
      }
      info[2] = numDocuments;
      info[1] = numDocuments - info[3] + 1;
      info[4] = (info[1] + info[2]) / 2;
      break;
    case 3: // top d
    case 4: // top p
      if (method == 3) {
        info[3] = info[5];
      } else {
        info[3] = (int) (Math.ceil((info[5] / 100.0) * numDocuments));
      }
      info[1] = 1;
      info[2] = info[3];
      info[4] = (info[1] + info[2]) / 2;
      break;
    case 5: // window around percentile
    case 6: { // window around rank
      info[3] = info[5];
      if (method == 5) {
        info[4] = (int) (Math.floor((info[6] / 100.0) * numDocuments));
      } else {
        info[4] = info[6];
      }
      int halfWidth = (info[3] - 1) / 2;
      info[1] = info[4] - halfWidth;
      info[2] = info[4] + halfWidth;
      break;
    }
    default:
      break;
  }
}
/**
 * Requires that the named file exists; otherwise prints a message and terminates the JVM with
 * status 40.
 *
 * @param fileName path to check (a null name counts as missing)
 */
private void checkFile(String fileName) {
  if (fileExists(fileName)) {
    return;
  }
  println("The file " + fileName + " was not found!");
  System.exit(40);
}
/**
 * Tells whether {@code fileName} names an existing file system entry.
 *
 * @param fileName path to test; may be null
 * @return false for a null name, otherwise the result of {@link File#exists()}
 */
private boolean fileExists(String fileName) {
  return fileName != null && new File(fileName).exists();
}
/**
 * Compresses {@code inputFileName} into a sibling file with {@code ".gz"} appended to its name,
 * by delegating to the two-argument overload.
 *
 * @param inputFileName path of the file to compress
 */
private void gzipFile(String inputFileName) {
  String gzippedFileName = inputFileName + ".gz";
  gzipFile(inputFileName, gzippedFileName);
}
/**
 * Gzips {@code inputFileName} into {@code gzippedFileName} and then deletes the input file.
 * An I/O failure is reported on stderr and terminates the JVM.
 *
 * @param inputFileName path of the file to compress (removed on success)
 * @param gzippedFileName path of the compressed file to create
 */
private void gzipFile(String inputFileName, String gzippedFileName) {
  // NOTE: this will delete the original file
  try {
    FileInputStream source = new FileInputStream(inputFileName);
    GZIPOutputStream sink = new GZIPOutputStream(new FileOutputStream(gzippedFileName));
    byte[] chunk = new byte[4096];
    for (int n = source.read(chunk); n > 0; n = source.read(chunk)) {
      sink.write(chunk, 0, n);
    }
    source.close();
    sink.finish();
    sink.close();
    deleteFile(inputFileName);
  } catch (IOException e) {
    System.err.println("IOException in MIRACore.gzipFile(String,String): " + e.getMessage());
    System.exit(99902);
  }
}
/**
 * Decompresses {@code gzippedFileName} via the two-argument overload. The output name is the
 * input name without its trailing {@code ".gz"}, or the input name plus {@code ".dec"} when it
 * does not end in {@code ".gz"}.
 *
 * @param gzippedFileName path of the gzip file
 */
@SuppressWarnings("unused")
private void gunzipFile(String gzippedFileName) {
  String outputFileName = gzippedFileName.endsWith(".gz")
      ? gzippedFileName.substring(0, gzippedFileName.length() - 3)
      : gzippedFileName + ".dec";
  gunzipFile(gzippedFileName, outputFileName);
}
/**
 * Decompresses {@code gzippedFileName} into {@code outputFileName} and then deletes the gzip
 * file. An I/O failure is reported on stderr and terminates the JVM.
 *
 * @param gzippedFileName path of the gzip file (removed on success)
 * @param outputFileName path of the decompressed file to create
 */
private void gunzipFile(String gzippedFileName, String outputFileName) {
  // NOTE: this will delete the original file
  try {
    GZIPInputStream source = new GZIPInputStream(new FileInputStream(gzippedFileName));
    FileOutputStream sink = new FileOutputStream(outputFileName);
    byte[] chunk = new byte[4096];
    for (int n = source.read(chunk); n > 0; n = source.read(chunk)) {
      sink.write(chunk, 0, n);
    }
    source.close();
    sink.close();
    deleteFile(gzippedFileName);
  } catch (IOException e) {
    System.err.println("IOException in MIRACore.gunzipFile(String,String): " + e.getMessage());
    System.exit(99902);
  }
}
/**
 * Interleaves {@code numFiles} per-reference files into a single file in which the references
 * of sentence 1 come first, then those of sentence 2, and so on.
 *
 * <p>The input files are named {@code prefix + index} (or {@code prefix + "." + index} if
 * {@code prefix + "1"} does not exist), with consecutive indices starting at 0 if the index-0
 * file exists and at 1 otherwise. All of them must have the same number of lines. Input is
 * decoded as UTF-8 and the unified file is written as UTF-8 as well, so non-ASCII text is
 * preserved regardless of the platform default charset.
 *
 * <p>Missing files, a line-count mismatch, and I/O errors print a message and terminate the
 * JVM.
 *
 * @param prefix common prefix of the reference files
 * @param numFiles number of reference files; with fewer than 2 nothing is done
 * @return the name of the unified file ({@code prefix} followed by {@code ".all"}), or
 *         {@code prefix} itself when {@code numFiles < 2}
 */
private String createUnifiedRefFile(String prefix, int numFiles) {
  if (numFiles < 2) {
    println("Warning: createUnifiedRefFile called with numFiles = " + numFiles + "; "
        + "doing nothing.", 1);
    return prefix;
  }

  // accept both "<prefix>N" and "<prefix>.N" naming
  if (!new File(prefix + "1").exists()) {
    if (!new File(prefix + ".1").exists()) {
      println("Can't find reference files.");
      System.exit(50);
    } else {
      prefix = prefix + ".";
    }
  }
  String outFileName = prefix.endsWith(".") ? prefix + "all" : prefix + ".all";

  PrintWriter outFile = null;
  BufferedReader[] inFile = new BufferedReader[numFiles];
  try {
    // same charset as the readers below
    outFile = new PrintWriter(outFileName, "utf8");
    int nextIndex = new File(prefix + "0").exists() ? 0 : 1;
    int lineCount = countLines(prefix + nextIndex);
    for (int r = 0; r < numFiles; ++r) {
      if (countLines(prefix + nextIndex) != lineCount) {
        println("Line count mismatch in " + (prefix + nextIndex) + ".");
        System.exit(60);
      }
      InputStream inStream = new FileInputStream(new File(prefix + nextIndex));
      inFile[r] = new BufferedReader(new InputStreamReader(inStream, "utf8"));
      ++nextIndex;
    }
    // one line from each reference file in turn, for every sentence
    for (int i = 0; i < lineCount; ++i) {
      for (int r = 0; r < numFiles; ++r) {
        outFile.println(inFile[r].readLine());
      }
    }
    // PrintWriter never throws on write; checkError() flushes and reveals swallowed failures
    if (outFile.checkError()) {
      throw new IOException("could not write " + outFileName);
    }
  } catch (FileNotFoundException e) {
    System.err.println("FileNotFoundException in MIRACore.createUnifiedRefFile(String,int): "
        + e.getMessage());
    System.exit(99901);
  } catch (IOException e) {
    System.err.println("IOException in MIRACore.createUnifiedRefFile(String,int): "
        + e.getMessage());
    System.exit(99902);
  } finally {
    if (outFile != null) {
      outFile.close();
    }
    for (int r = 0; r < numFiles; ++r) {
      if (inFile[r] != null) {
        try {
          inFile[r].close();
        } catch (IOException ignored) {
          // input has been fully consumed (or we are already failing); nothing to recover
        }
      }
    }
  }
  return outFileName;
} // createUnifiedRefFile(String prefix, int numFiles)
/**
 * Applies text normalization to {@code str}.
 *
 * <p>Method 0 returns the input unchanged. Every other method:
 * <ol>
 * <li>replaces the entities {@code &quot; &amp; &lt; &gt; &apos;} by their characters;</li>
 * <li>puts spaces around each of the ASCII punctuation characters listed in
 * {@code split_on};</li>
 * <li>puts spaces around "." and "," unless both neighbors are digits, and around "-" when it
 * directly follows a digit;</li>
 * <li>re-attaches the clitics {@code 'm 're 's 've 'll 'd n't} as done by the replacement
 * table below;</li>
 * <li>(methods 2 and 4) removes the spaces around a "-" that stands between two
 * space-separated letters;</li>
 * <li>(methods 3 and 4) drops every character above code 127;</li>
 * <li>collapses whitespace runs to a single space and trims.</li>
 * </ol>
 *
 * @param str text to normalize
 * @param normMethod normalization method, as described above
 * @return the normalized text
 */
private String normalize(String str, int normMethod) {
  if (normMethod == 0) {
    return str;
  }

  // replace HTML/SGML
  final String[][] entities = {
      {"&quot;", "\""}, {"&amp;", "&"}, {"&lt;", "<"}, {"&gt;", ">"}, {"&apos;", "'"}};
  for (String[] entity : entities) {
    str = str.replaceAll(entity[0], entity[1]);
  }

  // split on these characters:
  // ! " # $ % & ( ) * + / : ; < = > ? @ [ \ ] ^ _ ` { | } ~
  // i.e. ASCII 33-126, except alphanumeric, and except "," "-" "." "'"
  String split_on = "!\"#\\$%&\\(\\)\\*\\+/:;<=>\\?@\\[\\\\\\]\\^_`\\{\\|\\}~";
  int pos = 0;
  while (pos < split_on.length()) {
    String regex = String.valueOf(split_on.charAt(pos++));
    if (regex.equals("\\")) {
      // a backslash escapes the character that follows it; keep the pair together
      regex += split_on.charAt(pos++);
    }
    str = str.replaceAll(regex, " " + regex + " ");
  }

  // split on "." and "," and "-", conditioned on proper context
  // (the padding guarantees that each of them has a neighbor on both sides)
  str = (" " + str + " ").replaceAll("\\s+", " ");
  StringBuilder spaced = new StringBuilder();
  for (int i = 0; i < str.length(); ++i) {
    char ch = str.charAt(i);
    boolean split = false;
    if (ch == '.' || ch == ',') {
      // split if either of the previous or next characters is a non-digit
      char before = str.charAt(i - 1);
      char after = str.charAt(i + 1);
      split = before < '0' || before > '9' || after < '0' || after > '9';
    } else if (ch == '-') {
      // split if preceded by a digit
      char before = str.charAt(i - 1);
      split = before >= '0' && before <= '9';
    }
    if (split) {
      spaced.append(' ').append(ch).append(' ');
    } else {
      spaced.append(ch);
    }
  }

  // rejoin i'm, we're, *'s, won't, don't, etc
  str = (" " + spaced + " ").replaceAll("\\s+", " ");
  final String[][] rejoin = {
      {" i 'm ", " i'm "}, {" we 're ", " we're "}, {" 's ", "'s "}, {" 've ", "'ve "},
      {" 'll ", "'ll "}, {" 'd ", "'d "}, {" n't ", "n't "}};
  for (String[] pair : rejoin) {
    str = str.replaceAll(pair[0], pair[1]);
  }

  // remove spaces around dashes
  if (normMethod == 2 || normMethod == 4) {
    str = " " + str + " ";
    boolean[] drop = new boolean[str.length()];
    for (int i = 0; i < str.length(); ++i) {
      if (str.charAt(i) != '-') {
        continue;
      }
      // rejoin if surrounded by spaces, and then letters
      if (str.charAt(i - 1) == ' ' && str.charAt(i + 1) == ' ') {
        if (Character.isLetter(str.charAt(i - 2)) && Character.isLetter(str.charAt(i + 2))) {
          drop[i - 1] = true;
          drop[i + 1] = true;
        }
      }
    }
    StringBuilder joined = new StringBuilder();
    for (int i = 0; i < str.length(); ++i) {
      if (!drop[i]) {
        joined.append(str.charAt(i));
      }
    }
    str = joined.toString();
  }

  // drop non-ASCII characters
  if (normMethod == 3 || normMethod == 4) {
    StringBuilder ascii = new StringBuilder();
    for (int i = 0; i < str.length(); ++i) {
      char ch = str.charAt(i);
      if (ch <= 127) { // i.e. if ASCII
        ascii.append(ch);
      }
    }
    str = ascii.toString();
  }

  return str.replaceAll("\\s+", " ").trim();
}
/**
 * Counts the lines of a text file. An I/O failure is reported on stderr and terminates the
 * JVM.
 *
 * @param fileName path of the file to read
 * @return number of lines, as delimited by {@link BufferedReader#readLine()}
 */
private int countLines(String fileName) {
  int total = 0;
  try {
    BufferedReader reader = new BufferedReader(new FileReader(fileName));
    while (reader.readLine() != null) {
      ++total;
    }
    reader.close();
  } catch (IOException e) {
    System.err.println("IOException in MIRACore.countLines(String): " + e.getMessage());
    System.exit(99902);
  }
  return total;
}
/**
 * Counts the lines of a text file that contain at least one character. An I/O failure is
 * reported on stderr and terminates the JVM.
 *
 * @param fileName path of the file to read
 * @return number of lines whose length is greater than zero
 */
private int countNonEmptyLines(String fileName) {
  int total = 0;
  try {
    BufferedReader reader = new BufferedReader(new FileReader(fileName));
    for (String line = reader.readLine(); line != null; line = reader.readLine()) {
      if (line.length() > 0) {
        ++total;
      }
    }
    reader.close();
  } catch (IOException e) {
    System.err.println("IOException in MIRACore.countNonEmptyLines(String): " + e.getMessage());
    System.exit(99902);
  }
  return total;
}
/**
 * Resolves {@code fileName} against the directory {@code dir}.
 *
 * @param dir parent directory
 * @param fileName name of the file inside {@code dir}
 * @return the absolute path of the combined location
 */
private String fullPath(String dir, String fileName) {
  return new File(dir, fileName).getAbsolutePath();
}
/** Runs the memory cleanup with at most 100 repetitions and reports the result. */
@SuppressWarnings("unused")
private void cleanupMemory() {
  final boolean silent = false;
  cleanupMemory(100, silent);
}
/** Runs the memory cleanup with at most 100 repetitions without reporting anything. */
@SuppressWarnings("unused")
private void cleanupMemorySilently() {
  final boolean silent = true;
  cleanupMemory(100, silent);
}
/**
 * Requests finalization and garbage collection repeatedly, stopping after {@code reps} rounds
 * or as soon as a round leaves the amount of used memory unchanged.
 *
 * @param reps maximum number of rounds
 * @param silent when false, the change in used and total memory (in MB) is printed at
 *        priority 2
 */
private void cleanupMemory(int reps, boolean silent) {
  final int bytesPerMB = 1024 * 1024;
  final long totalMemBefore = myRuntime.totalMemory();
  final long usedMemBefore = totalMemBefore - myRuntime.freeMemory();

  // perform garbage collection repeatedly, until there is no decrease in
  // the amount of used memory
  long used = usedMemBefore;
  int round = 1;
  while (round <= reps) {
    myRuntime.runFinalization();
    myRuntime.gc();
    Thread.yield();
    long previous = used;
    used = myRuntime.totalMemory() - myRuntime.freeMemory();
    if (used == previous) {
      break;
    }
    ++round;
  }

  if (silent) {
    return;
  }
  long totalMemAfter = myRuntime.totalMemory();
  long usedMemAfter = totalMemAfter - myRuntime.freeMemory();
  println("GC: d_used = " + ((usedMemAfter - usedMemBefore) / bytesPerMB) + " MB "
      + "(d_tot = " + ((totalMemAfter - totalMemBefore) / bytesPerMB) + " MB).", 2);
}
/** Prints, at priority 2, the JVM's allocated memory and how much of it is in use (in MB). */
@SuppressWarnings("unused")
private void printMemoryUsage() {
  final int bytesPerMB = 1024 * 1024;
  final long allocated = myRuntime.totalMemory();
  final long inUse = allocated - myRuntime.freeMemory();
  println("Allocated memory: " + (allocated / bytesPerMB) + " MB " + "(of which "
      + (inUse / bytesPerMB) + " MB is being used).", 2);
}
/**
 * Prints {@code obj} followed by a line break, but only if {@code priority} does not exceed the
 * current verbosity (a lower priority value means a more important message).
 */
private void println(Object obj, int priority) {
  if (priority > verbosity) {
    return;
  }
  println(obj);
}
/**
 * Prints {@code obj} without a line break, but only if {@code priority} does not exceed the
 * current verbosity (a lower priority value means a more important message).
 */
private void print(Object obj, int priority) {
  if (priority > verbosity) {
    return;
  }
  print(obj);
}
/** Unconditionally prints the string form of {@code obj} and a line break to standard output. */
private void println(Object obj) {
  System.out.println(String.valueOf(obj));
}
/** Unconditionally prints the string form of {@code obj} to standard output, no line break. */
private void print(Object obj) {
  System.out.print(String.valueOf(obj));
}
/** Advances the progress counter and prints a "." (priority 2) on every 100000th call. */
@SuppressWarnings("unused")
private void showProgress() {
  progress += 1;
  boolean atMilestone = (progress % 100000 == 0);
  if (atMilestone) {
    print(".", 2);
  }
}
/**
 * Builds a weight vector in which every optimizable parameter gets a random value drawn
 * uniformly between {@code minRandValue[c]} and {@code maxRandValue[c]}, and every other
 * parameter gets {@code defaultLambda[c]}.
 *
 * <p>Parameters are 1-indexed; the element at index 0 is a 0.0 placeholder. The list is built
 * with {@code add}: the constructor argument of {@code ArrayList} only reserves capacity, so
 * calling {@code set} on the freshly created (empty) list would throw
 * {@code IndexOutOfBoundsException}.
 *
 * @return a list of {@code 1 + numParams} weights
 */
private ArrayList<Double> randomLambda() {
  ArrayList<Double> retLambda = new ArrayList<Double>(1 + numParams);
  retLambda.add(0.0); // index 0 is unused
  for (int c = 1; c <= numParams; ++c) {
    if (isOptimizable[c]) {
      double randVal = randGen.nextDouble(); // number in [0.0,1.0)
      ++generatedRands;
      randVal = randVal * (maxRandValue[c] - minRandValue[c]); // number in [0.0,max-min)
      randVal = minRandValue[c] + randVal; // number in [min,max)
      retLambda.add(randVal);
    } else {
      retLambda.add(defaultLambda[c]);
    }
  }
  return retLambda;
}
/**
 * Returns a copy of {@code origLambda} in which every optimizable parameter is scaled by a
 * random factor from {@code [1 - sigma, 1 + sigma]}; other parameters are copied unchanged.
 *
 * <p>{@code sigma} is {@code mult} times a base value chosen by {@code method}:
 * 1: {@code 1 / i^param}; 2: {@code exp(-param * i)}; 3: {@code max(0, 1 - i / param)};
 * any other method: 0.
 *
 * @param origLambda weights to perturb (1-indexed, length at least {@code 1 + numParams})
 * @param i index fed into the sigma formula
 * @param method selects the sigma formula
 * @param param parameter of the sigma formula
 * @param mult multiplier applied to the base sigma
 * @return a new array of length {@code 1 + numParams}
 */
private double[] randomPerturbation(double[] origLambda, int i, double method, double param,
    double mult) {
  double base;
  if (method == 1) {
    base = 1.0 / Math.pow(i, param);
  } else if (method == 2) {
    base = Math.exp(-param * i);
  } else if (method == 3) {
    base = Math.max(0.0, 1.0 - (i / param));
  } else {
    base = 0.0;
  }
  final double sigma = mult * base;

  double[] perturbed = new double[1 + numParams];
  for (int c = 1; c <= numParams; ++c) {
    if (!isOptimizable[c]) {
      perturbed[c] = origLambda[c];
      continue;
    }
    double unit = 2 * randGen.nextDouble() - 1.0; // number in [-1.0,1.0]
    ++generatedRands;
    // orig[c] + unit*sigma*orig[c], i.e. a value in [orig[c]*(1-sigma),orig[c]*(1+sigma)]
    perturbed[c] = unit * sigma * origLambda[c] + origLambda[c];
  }
  return perturbed;
}
/**
 * Finds the lines that can be eliminated because another line with the same slope has a
 * strictly higher offset. That is, for any k1 and k2 with slope[k1] = slope[k2] and
 * offset[k1] &gt; offset[k2], k2 is discarded. Lines that tie on both slope and offset are all
 * kept. (This is actually important to do as it eliminates a bug.)
 *
 * <p>Runs in a single pass over the candidates, remembering for each slope the index of the
 * candidate with the highest offset seen so far.
 *
 * @param slope slope of each candidate line
 * @param offset offset of each candidate line (same indexing as {@code slope})
 * @return indices of the candidates that can be discarded
 */
@SuppressWarnings("unused")
private HashSet<Integer> indicesToDiscard(double[] slope, double[] offset) {
  HashSet<Integer> dominated = new HashSet<Integer>();
  // maps slope to index of best candidate that has that slope.
  // ("best" as in the one with the highest offset)
  HashMap<Double, Integer> bestBySlope = new HashMap<Double, Integer>();
  for (int k = 0; k < slope.length; ++k) {
    Double key = Double.valueOf(slope[k]);
    Integer incumbent = bestBySlope.get(key);
    if (incumbent == null) {
      bestBySlope.put(key, k);
    } else if (offset[incumbent] > offset[k]) {
      dominated.add(k);
    } else if (offset[k] > offset[incumbent]) {
      bestBySlope.put(key, k);
      dominated.add(incumbent);
    }
  }
  return dominated;
} // indicesToDiscard(double[] slope, double[] offset)
}