blob: 70d72bbf0959e0478d2fb3ede2db609f34b36da6 [file] [log] [blame]
package joshua.decoder;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import joshua.util.Regex;
import joshua.util.io.LineReader;
/**
 * Configuration file for Joshua decoder.
 * <p>
 * When adding new features to Joshua, any new configurable parameters should be added to this
 * class.
 * <p>
 * All settings are kept in public static fields. They are populated by
 * {@link #readConfigFile(String)} (and, for command-line flags, by
 * {@link #processCommandLineOptions(String[])}, which funnels into the same parser). Parameter
 * names are compared after {@link #normalize_key(String)}, so hyphens, underscores and case are
 * not significant.
 *
 * @author Zhifei Li, <zhifei.work@gmail.com>
 * @author Matt Post <post@cs.jhu.edu>
 */
public class JoshuaConfiguration {

  private static final Logger logger = Logger.getLogger(JoshuaConfiguration.class.getName());

  // lm config
  // new format enabling multiple language models
  public static ArrayList<String> lms = new ArrayList<String>();

  // new format enabling any number of grammar files
  public static ArrayList<String> tms = new ArrayList<String>();

  // old format specifying attributes of a single language model separately
  public static String lm_type = "kenlm";
  public static double lm_ceiling_cost = 100;
  public static boolean use_left_equivalent_state = false;
  public static boolean use_right_equivalent_state = false;
  public static int lm_order = 3;
  public static String lm_file = null;

  /*
   * The file to read the weights from (part of the sparse features implementation).
   */
  public static String weights_file = "";

  /*
   * The span limit is the maximum span of the input to which rules from the main translation
   * grammar can be applied. It does not apply to the glue grammar.
   */
  public static int span_limit = 20;

  /*
   * The owner is an index into a grammar's feature sets. The name here ties together the features
   * present on each grammar line in a grammar file, and the features present in the Joshua
   * configuration file. This allows you to have different sets of features (or shared) across
   * grammar files.
   */
  public static String phrase_owner = "pt";
  public static String glue_owner = "glue";

  // Default symbols. The symbol here should be enclosed in square brackets.
  public static String default_non_terminal = "[X]";
  public static String goal_symbol = "[GOAL]";

  public static boolean dense_features = true;

  /*
   * If false, sorting of the complete grammar is done at load time. If true, grammar tries are not
   * sorted till they are first accessed.
   */
  public static boolean amortized_sorting = true;

  public static String tm_file = null;
  public static String tm_format = "thrax";

  // TODO: support multiple glue grammars
  public static String glue_file = null;
  public static String glue_format = "thrax";

  // syntax-constrained decoding
  public static boolean constrain_parse = false;
  public static boolean use_pos_labels = false;

  // oov-specific
  public static boolean true_oovs_only = false;

  /* Sentence-level filtering. */
  public static boolean filter_grammar = false;

  // pruning config
  // Cube pruning is always on, with a span-level pop limit of 100.
  // Beam and threshold pruning can be enabled, which also changes
  // the nature of cube pruning so that the pop limit is no longer
  // used. If both are turned off, exhaustive pruning takes effect.
  public static int pop_limit = 100;
  public static boolean useCubePrune = true;
  public static boolean useBeamAndThresholdPrune = false;
  public static double fuzz1 = 0.1;
  public static double fuzz2 = 0.1;
  public static int max_n_items = 30;
  public static double relative_threshold = 10.0;
  public static int max_n_rules = 50;

  /* Maximum sentence length */
  public static int maxlen = 200;

  /*
   * N-best configuration.
   */
  // make sure output strings are unique
  public static boolean use_unique_nbest = false;

  // include the phrasal alignments in the output
  public static boolean include_align_index = false;

  // The number of hypotheses to output by default
  public static int topN = 1;

  /*
   * This string describes the format of each line of output from the decoder (i.e., the
   * translations). The string can include arbitrary text and also variables. The following
   * variables are available:
   *
   * %i the 0-index sentence number
   * %s the translated sentence
   * %S the translated sentence, denormalized
   * %t the synchronous derivation
   * %f the list of feature values (as name=value pairs)
   * %c the model cost
   * %w the weight vector
   * %a the alignments between source and target words (currently unimplemented)
   */
  public static String outputFormat = "%i ||| %s ||| %f ||| %c";

  public static boolean escape_trees = false;

  public static int num_parallel_decoders = 1; // number of threads should run

  // disk hg
  public static String hypergraphFilePattern = "";

  // hypergraph visualization
  public static boolean visualize_hypergraph = false;

  // use google linear corpus gain?
  public static boolean useGoogleLinearCorpusGain = false;
  public static double[] linearCorpusGainThetas = null;

  public static boolean mark_oovs = true;

  // used to extract oracle hypotheses from the forest
  public static String oracleFile = null;

  public static boolean parse = false; // perform synchronous parsing

  // Substantive config lines that are not "key = value" parameters, plus "feature-function"
  // parameters; they are only collected here.
  public static ArrayList<String> features = new ArrayList<String>();

  /* If set, Joshua will start a (multi-threaded, per "threads") TCP/IP server on this port. */
  public static int server_port = 0;

  // ===============================================================
  // Methods
  // ===============================================================

  /**
   * To process command-line options, we write them to a file that looks like the config file, and
   * then call readConfigFile() on it. It would be more general to define a class that sits on a
   * stream and knows how to chop it up, but this was quicker to implement.
   * <p>
   * Each option has its first character (the leading dash) removed to obtain the key. An option
   * that is last, or that is followed by another item starting with "-", is written as
   * {@code key=true}; otherwise the following item is consumed as its value. On an I/O failure the
   * error is logged and the JVM exits with status 1.
   *
   * @param options the raw command-line arguments
   */
  public static void processCommandLineOptions(String[] options) {
    File tmpFile = null;
    try {
      tmpFile = File.createTempFile("options", null, null);
      // Safety net: readConfigFile() and the catch block below may call System.exit(), in which
      // case the finally clause never runs. deleteOnExit() still removes the file then.
      tmpFile.deleteOnExit();

      PrintWriter out = new PrintWriter(new FileWriter(tmpFile));
      try {
        for (int i = 0; i < options.length; i++) {
          if (options[i].length() == 0) {
            // substring(1) below would throw on an empty argument
            logger.warning("ignoring empty command-line option");
            continue;
          }
          String key = options[i].substring(1);
          if (i + 1 == options.length || options[i + 1].startsWith("-")) {
            // if this is the last item, or if the next item
            // is another flag, then this is an argument-less
            // flag
            out.println(key + "=true");
          } else {
            out.println(key + "=" + options[i + 1]);
            // skip the next item
            i++;
          }
        }
        // PrintWriter never throws on write errors; ask it explicitly.
        if (out.checkError()) {
          throw new IOException("could not write options to '" + tmpFile + "'");
        }
      } finally {
        out.close();
      }

      JoshuaConfiguration.readConfigFile(tmpFile.getCanonicalPath());
    } catch (IOException e) {
      logger.log(Level.SEVERE, "* FATAL: could not process command-line options", e);
      System.exit(1);
    } finally {
      if (tmpFile != null) {
        tmpFile.delete();
      }
    }
  }

  // This is static instead of a constructor because all the fields
  // are static.
  /**
   * Reads a configuration file and stores its settings in the static fields of this class.
   * <p>
   * Comment and blank lines are skipped. There are two kinds of substantive lines: parameters and
   * feature values. Lines containing "=" are parameters of the form "key = value"; all other
   * substantive lines are appended to {@link #features}. Malformed parameter lines, unknown
   * parameters and unsupported values terminate the JVM with status 1.
   * <p>
   * After the file is read, old-style single LM / TM settings are converted into entries of
   * {@link #lms} and {@link #tms} when those lists are still empty, and {@link #lm_order} is raised
   * to the largest order of any listed language model.
   *
   * @param configFile path of the configuration file to read
   * @throws IOException if reading the configuration file fails
   */
  public static void readConfigFile(String configFile) throws IOException {
    LineReader configReader = new LineReader(configFile);
    try {
      for (String line : configReader) {
        line = line.trim(); // .toLowerCase();
        if (Regex.commentOrEmptyLine.matches(line))
          continue;

        if (line.indexOf("=") != -1) { // parameters; (not feature function)
          String[] fds = Regex.equalsWithSpaces.split(line);
          if (fds.length != 2) {
            logger.severe("* FATAL: bad config file line '" + line + "'");
            System.exit(1);
          }
          setParameter(line, fds[0], fds[1]);
          logger.info(String.format(" %s = '%s'", normalize_key(fds[0]), fds[1]));
        } else {
          // Feature function. These are processed a bit later
          // in JoshuaDecoder initialization, so we just set
          // them aside for now.
          features.add(line);
        }
      }
    } finally {
      configReader.close();
    }

    addLegacyLanguageModel();
    raiseLmOrderToMaximum();
    addLegacyTranslationModels();
    checkLinearCorpusGain();
  }

  /** Returns true if the already-normalized parameter equals the normalized form of name. */
  private static boolean isKey(String parameter, String name) {
    return parameter.equals(normalize_key(name));
  }

  /** Wraps the symbol in square brackets unless it is already enclosed in them. */
  private static String ensureBracketed(String symbol) {
    if (symbol.matches("\\[.*\\]"))
      return symbol;
    return "[" + symbol + "]";
  }

  /**
   * Applies one "key = value" configuration line to the matching static field.
   *
   * @param line the complete (trimmed) configuration line, used in error messages
   * @param rawKey the key exactly as written in the file
   * @param value the text to the right of the equals sign
   */
  private static void setParameter(String line, String rawKey, String value) {
    String parameter = normalize_key(rawKey);

    if (isKey(parameter, "lm")) {
      // store the line for later processing
      lms.add(value);

    } else if (isKey(parameter, "tm")) {
      // store the line for later processing
      tms.add(value);

    } else if (isKey(parameter, "dump-hypergraph")) {
      hypergraphFilePattern = value.trim();
      logger.finest(String.format(" hypergraph dump file format: %s", hypergraphFilePattern));

    } else if (isKey(parameter, "lm_file")) {
      lm_file = value.trim();
      logger.finest(String.format("lm file: %s", lm_file));

    } else if (isKey(parameter, "parse")) {
      parse = Boolean.parseBoolean(value);
      logger.finest(String.format("parse: %s", parse));

    } else if (isKey(parameter, "tm_file")) {
      tm_file = value.trim();
      logger.finest(String.format("tm file: %s", tm_file));

    } else if (isKey(parameter, "glue_file")) {
      glue_file = value.trim();
      logger.finest(String.format("glue file: %s", glue_file));

    } else if (isKey(parameter, "tm_format")) {
      tm_format = value.trim();
      logger.finest(String.format("tm format: %s", tm_format));

    } else if (isKey(parameter, "glue_format")) {
      glue_format = value.trim();
      logger.finest(String.format("glue format: %s", glue_format));

    } else if (isKey(parameter, "lm_type")) {
      lm_type = String.valueOf(value);
      if (!lm_type.equals("kenlm") && !lm_type.equals("berkeleylm")
          && !lm_type.equals("none") && !lm_type.equals("javalm")) {
        System.err.println("* FATAL: lm_type '" + lm_type + "' not supported");
        System.err
            .println("* supported types are 'kenlm' (default), 'berkeleylm', and 'javalm' (not recommended), and 'none'");
        System.exit(1);
      }

    } else if (isKey(parameter, "lm_ceiling_cost")) {
      lm_ceiling_cost = Double.parseDouble(value);
      logger.finest(String.format("lm_ceiling_cost: %s", lm_ceiling_cost));

    } else if (isKey(parameter, "use_left_equivalent_state")) {
      use_left_equivalent_state = Boolean.parseBoolean(value);
      logger.finest(String.format("use_left_equivalent_state: %s", use_left_equivalent_state));

    } else if (isKey(parameter, "use_right_equivalent_state")) {
      use_right_equivalent_state = Boolean.parseBoolean(value);
      logger.finest(String.format("use_right_equivalent_state: %s",
          use_right_equivalent_state));

    } else if (isKey(parameter, "order")) {
      lm_order = Integer.parseInt(value);
      logger.finest(String.format("g_lm_order: %s", lm_order));

    } else if (isKey(parameter, "span_limit")) {
      span_limit = Integer.parseInt(value);
      logger.finest(String.format("span_limit: %s", span_limit));

    } else if (isKey(parameter, "phrase_owner")) {
      phrase_owner = value.trim();
      logger.finest(String.format("phrase_owner: %s", phrase_owner));

    } else if (isKey(parameter, "glue_owner")) {
      glue_owner = value.trim();
      logger.finest(String.format("glue_owner: %s", glue_owner));

    } else if (isKey(parameter, "default_non_terminal")) {
      // Add brackets only when missing, so that both "X" and "[X]" yield "[X]" (same rule as
      // the goal symbol below).
      default_non_terminal = ensureBracketed(value.trim());
      logger.finest(String.format("default_non_terminal: %s", default_non_terminal));

    } else if (isKey(parameter, "goalSymbol")) {
      // If the goal symbol was not enclosed in square brackets, then add them
      goal_symbol = ensureBracketed(value.trim());
      logger.finest("goalSymbol: " + goal_symbol);

    } else if (isKey(parameter, "weights-file")) {
      weights_file = value;

    } else if (isKey(parameter, "constrain_parse")) {
      constrain_parse = Boolean.parseBoolean(value);

    } else if (isKey(parameter, "true_oovs_only")) {
      true_oovs_only = Boolean.parseBoolean(value);

    } else if (isKey(parameter, "filter-grammar")) {
      filter_grammar = Boolean.parseBoolean(value);

    } else if (isKey(parameter, "amortize")) {
      amortized_sorting = Boolean.parseBoolean(value);

    } else if (isKey(parameter, "use_pos_labels")) {
      use_pos_labels = Boolean.parseBoolean(value);

    } else if (isKey(parameter, "fuzz1")) {
      fuzz1 = Double.parseDouble(value);
      logger.finest(String.format("fuzz1: %s", fuzz1));

    } else if (isKey(parameter, "fuzz2")) {
      fuzz2 = Double.parseDouble(value);
      logger.finest(String.format("fuzz2: %s", fuzz2));

    } else if (isKey(parameter, "max_n_items")) {
      max_n_items = Integer.parseInt(value);
      logger.finest(String.format("max_n_items: %s", max_n_items));

    } else if (isKey(parameter, "relative_threshold")) {
      relative_threshold = Double.parseDouble(value);
      logger.finest(String.format("relative_threshold: %s", relative_threshold));

    } else if (isKey(parameter, "max_n_rules")) {
      max_n_rules = Integer.parseInt(value);
      logger.finest(String.format("max_n_rules: %s", max_n_rules));

    } else if (isKey(parameter, "use_unique_nbest")) {
      use_unique_nbest = Boolean.parseBoolean(value);
      logger.finest(String.format("use_unique_nbest: %s", use_unique_nbest));

    } else if (isKey(parameter, "output-format")) {
      outputFormat = value;
      logger.finest(String.format("output-format: %s", outputFormat));

    } else if (isKey(parameter, "escape_trees")) {
      escape_trees = Boolean.parseBoolean(value);
      logger.finest(String.format("escape_trees: %s", escape_trees));

    } else if (isKey(parameter, "include_align_index")) {
      include_align_index = Boolean.parseBoolean(value);
      logger.finest(String.format("include_align_index: %s", include_align_index));

    } else if (isKey(parameter, "top_n")) {
      topN = Integer.parseInt(value);
      logger.finest(String.format("topN: %s", topN));

    } else if (isKey(parameter, "num_parallel_decoders") || isKey(parameter, "threads")) {
      num_parallel_decoders = Integer.parseInt(value);
      if (num_parallel_decoders <= 0) {
        throw new IllegalArgumentException(
            "Must specify a positive number for num_parallel_decoders");
      }
      logger.finest(String.format("num_parallel_decoders: %s", num_parallel_decoders));

    } else if (isKey(parameter, "visualize_hypergraph")) {
      visualize_hypergraph = Boolean.parseBoolean(value);
      logger.finest(String.format("visualize_hypergraph: %s", visualize_hypergraph));

    } else if (isKey(parameter, "mark_oovs")) {
      mark_oovs = Boolean.parseBoolean(value);
      logger.finest(String.format("mark_oovs: %s", mark_oovs));

    } else if (isKey(parameter, "pop-limit")) {
      pop_limit = Integer.parseInt(value);
      logger.finest(String.format("pop-limit: %s", pop_limit));

    } else if (isKey(parameter, "useCubePrune")) {
      useCubePrune = Boolean.parseBoolean(value);
      if (!useCubePrune)
        logger.warning("useCubePrune=false");
      logger.finest(String.format("useCubePrune: %s", useCubePrune));

    } else if (isKey(parameter, "useBeamAndThresholdPrune")) {
      useBeamAndThresholdPrune = Boolean.parseBoolean(value);
      if (!useBeamAndThresholdPrune)
        logger.warning("useBeamAndThresholdPrune=false");
      logger.finest(String.format("useBeamAndThresholdPrune: %s", useBeamAndThresholdPrune));

    } else if (isKey(parameter, "useGoogleLinearCorpusGain")) {
      useGoogleLinearCorpusGain = Boolean.parseBoolean(value.trim());
      logger.finest(String.format("useGoogleLinearCorpusGain: %s", useGoogleLinearCorpusGain));

    } else if (isKey(parameter, "googleBLEUWeights")) {
      // exactly five semicolon-separated numbers are required
      String[] googleWeights = value.trim().split(";");
      if (googleWeights.length != 5) {
        logger.severe("wrong line=" + line);
        System.exit(1);
      }
      double[] thetas = new double[5];
      for (int i = 0; i < 5; i++)
        thetas[i] = Double.parseDouble(googleWeights[i]);
      linearCorpusGainThetas = thetas;
      // Arrays.toString: formatting the array itself with %s would only print its identity
      logger.finest(String.format("googleBLEUWeights: %s",
          Arrays.toString(linearCorpusGainThetas)));

    } else if (isKey(parameter, "oracleFile")) {
      oracleFile = value.trim();
      logger.info(String.format(" oracle file: %s", oracleFile));
      if (!new File(oracleFile).exists()) {
        logger.severe("FATAL: can't find oracle file '" + oracleFile + "'");
        System.exit(1);
      }

    } else if (isKey(parameter, "server-port")) {
      server_port = Integer.parseInt(value);
      logger.info(String.format(" server-port: %d", server_port));

    } else if (parameter.equals("c") || parameter.equals("config")) {
      // this was used to send in the config file, just ignore it

    } else if (isKey(parameter, "feature-function")) {
      // add the feature to the list of features for later processing
      features.add("feature_function = " + value);

    } else if (isKey(parameter, "maxlen")) {
      // maximum sentence length
      maxlen = Integer.parseInt(value);

    } else if (isKey(parameter, "use-sent-specific-tm")
        || isKey(parameter, "add-combined-cost")
        || isKey(parameter, "use-tree-nbest")
        || isKey(parameter, "use-kenlm")
        || isKey(parameter, "regexp-grammar")) {
      logger.warning(String.format("WARNING: ignoring deprecated parameter '%s'", rawKey));

    } else {
      logger.severe("FATAL: unknown configuration parameter '" + rawKey + "'");
      System.exit(1);
    }
  }

  /**
   * Backwards compatibility of LM format. If the config file did not contain lines of the form
   * "lm = ...", then we create one from the handful of separately-specified parameters. These
   * combined lines are later processed in JoshuaDecoder as part of the multiple LM support.
   */
  private static void addLegacyLanguageModel() {
    if (lms.size() == 0 && lm_file != null) {
      // Fixed locale so the decimal separator is always '.', whatever the JVM default locale is.
      String line = String.format(Locale.US, "%s %d %b %b %.2f %s", lm_type, lm_order,
          use_left_equivalent_state, use_right_equivalent_state, lm_ceiling_cost, lm_file);
      lms.add(line);
    }
  }

  /**
   * Language model orders are particular to each LM, but for purposes of state maintenance, we set
   * the global value to the maximum of any of the individual models. The order is read from the
   * second whitespace-separated token of each LM line.
   */
  private static void raiseLmOrderToMaximum() {
    for (String lmLine : lms) {
      String[] tokens = lmLine.trim().split("\\s+");
      if (tokens.length < 2) {
        logger.severe("* FATAL: bad lm line '" + lmLine + "'");
        System.exit(1);
      }
      int order = Integer.parseInt(tokens[1]);
      if (order > lm_order)
        lm_order = order;
    }
  }

  /**
   * Backward compatibility with the old TM format that allowed for just two grammars. The new
   * format is
   *
   * tm = FORMAT OWNER SPAN_LIMIT FILE
   */
  private static void addLegacyTranslationModels() {
    if (tms.size() == 0 && tm_file != null) {
      tms.add(String.format("%s %s %d %s", tm_format, phrase_owner, span_limit, tm_file));
      if (glue_file != null) {
        tms.add(String.format("%s %s %d %s", glue_format, glue_owner, -1, glue_file));
      } else {
        // Without this guard the literal string "null" would be recorded as the grammar file.
        logger.warning("tm_file was given without glue_file; no glue grammar entry was added");
      }
    }
  }

  /** Exits if Google linear corpus gain is enabled without exactly five weights. */
  private static void checkLinearCorpusGain() {
    if (!useGoogleLinearCorpusGain)
      return;

    if (linearCorpusGainThetas == null) {
      logger.severe("linearCorpusGainThetas is null, did you set googleBLEUWeights properly?");
      System.exit(1);
    } else if (linearCorpusGainThetas.length != 5) {
      logger
          .severe("linearCorpusGainThetas does not have five values, did you set googleBLEUWeights properly?");
      System.exit(1);
    }
  }

  /**
   * Checks for invalid variable configurations. Currently the only check is that a positive
   * pop limit is not combined with beam-and-threshold pruning; a violation prints an error and
   * exits the JVM with status 1.
   */
  public static void sanityCheck() {
    if (pop_limit > 0 && useBeamAndThresholdPrune) {
      System.err
          .println("* FATAL: 'pop-limit' > 0 is incompatible with 'useBeamAndThresholdPrune'");
      System.exit(1);
    }
  }

  /**
   * Normalizes parameter names by removing underscores and hyphens and lowercasing. This defines
   * equivalence classes on external use of parameter names, permitting arbitrary_under_scores and
   * camelCasing in parameter names without forcing the user to memorize them all. Here are some
   * examples of equivalent ways to refer to parameter names:
   *
   * {pop-limit, poplimit, PopLimit, popLimit, pop_lim_it} {lmfile, lm-file, LM-FILE, lm_file}
   *
   * @param text the parameter name as written by the user
   * @return the name with every '-' and '_' removed, in lower case
   */
  public static String normalize_key(String text) {
    return text.replaceAll("[-_]", "").toLowerCase();
  }
}