src/joshua/decoder/JoshuaConfiguration.java - joshua - Git at Google

 package joshua.decoder;

 import java.io.File;
 import java.io.FileWriter;
 import java.io.IOException;
 import java.io.PrintWriter;
 import java.util.ArrayList;
 import java.util.logging.Logger;

 import joshua.util.Regex;
 import joshua.util.io.LineReader;

 /**
  * Configuration file for Joshua decoder.
  * <p>
  * When adding new features to Joshua, any new configurable parameters should be added to this
  * class.
  *
  * @author Zhifei Li, <zhifei.work@gmail.com>
  * @author Matt Post <post@cs.jhu.edu>
  */
 public class JoshuaConfiguration {
   // lm config
   // new format enabling multiple language models
   public static ArrayList<String> lms = new ArrayList<String>();

   // new format enabling any number of grammar files
   public static ArrayList<String> tms = new ArrayList<String>();

   // old format specifying attributes of a single language model separately
   public static String lm_type = "kenlm";
   public static double lm_ceiling_cost = 100;
   public static boolean use_left_equivalent_state = false;
   public static boolean use_right_equivalent_state = false;
   public static int lm_order = 3;

   public static String lm_file = null;

   /*
    * The file to read the weights from (part of the sparse features implementation).
    */
   public static String weights_file = "";

   /*
    * The span limit is the maximum span of the input to which rules from the main translation
    * grammar can be applied. It does not apply to the glue grammar.
    */
   public static int span_limit = 20;

   /*
    * The owner is an index into a grammar's feature sets. The name here ties together the features
    * present on each grammar line in a grammar file and the features present in the Joshua
    * configuration file. This allows you to have different (or shared) sets of features across
    * grammar files.
    */
   public static String phrase_owner = "pt";
   public static String glue_owner = "glue";

   // Default symbols. The symbol here should be enclosed in square brackets.
   public static String default_non_terminal = "[X]";
   public static String goal_symbol = "[GOAL]";

   public static boolean dense_features = true;

   /* If false, sorting of the complete grammar is done at load time. If true, grammar tries are not
    * sorted till they are first accessed. */
   public static boolean amortized_sorting = true;

   public static String tm_file = null;
   public static String tm_format = "thrax";

   // TODO: support multiple glue grammars
   public static String glue_file = null;
   public static String glue_format = "thrax";

   // syntax-constrained decoding
   public static boolean constrain_parse = false;
   public static boolean use_pos_labels = false;

   // oov-specific
   public static boolean true_oovs_only = false;

   /* Sentence-level filtering. */
   public static boolean filter_grammar = false;

   // pruning config

   // Cube pruning is always on, with a span-level pop limit of 100.
   // Beam and threshold pruning can be enabled, which also changes
   // the nature of cube pruning so that the pop limit is no longer
   // used. If both are turned off, exhaustive pruning takes effect.
   public static int pop_limit = 100;
   public static boolean useCubePrune = true;
   public static boolean useBeamAndThresholdPrune = false;
   public static double fuzz1 = 0.1;
   public static double fuzz2 = 0.1;
   public static int max_n_items = 30;
   public static double relative_threshold = 10.0;
   public static int max_n_rules = 50;

   /* Maximum sentence length */
   public static int maxlen = 200;

   /*
    * N-best configuration.
    */
   // make sure output strings are unique
   public static boolean use_unique_nbest = false;
   // include the phrasal alignments in the output
   public static boolean include_align_index = false;
   // The number of hypotheses to output by default
   public static int topN = 1;

   /*
    * This string describes the format of each line of output from the decoder (i.e., the
    * translations). The string can include arbitrary text and also variables.  The following variables
    * are available:
    *
    *   %i the 0-index sentence number
    *   %s the translated sentence
    *   %S the translated sentence, denormalized
    *   %t the synchronous derivation
    *   %f the list of feature values (as name=value pairs)
    *   %c the model cost
    *   %w the weight vector
    *   %a the alignments between source and target words (currently unimplemented)
    */
   public static String outputFormat = "%i ||| %s ||| %f ||| %c";

   public static boolean escape_trees = false;

   public static int num_parallel_decoders = 1; // number of threads should run

   // disk hg
   public static String hypergraphFilePattern = "";

   // hypergraph visualization
   public static boolean visualize_hypergraph = false;

   // use google linear corpus gain?
   public static boolean useGoogleLinearCorpusGain = false;
   public static double[] linearCorpusGainThetas = null;
   public static boolean mark_oovs = true;

   // used to extract oracle hypotheses from the forest
   public static String oracleFile = null;

   public static boolean parse = false; // perform synchronous parsing

   private static final Logger logger = Logger.getLogger(JoshuaConfiguration.class.getName());

   public static ArrayList<String> features = new ArrayList<String>();

   /* If set, Joshua will start a (multi-threaded, per "threads") TCP/IP server on this port. */
   public static int server_port = 0;

   // ===============================================================
   // Methods
   // ===============================================================

   /**
    * Processes command-line options by writing them to a temporary file that looks like the config
    * file and then calling readConfigFile() on it. It would be more general to define a class that
    * sits on a stream and knows how to chop it up, but this was quicker to implement.
    * <p>
    * The first character of each option (normally "-") is stripped to obtain the key. An option that
    * is the last item, or that is followed by an item starting with "-", is written as "key=true";
    * otherwise the following item is consumed as the option's value.
    * <p>
    * If an IOException occurs, the stack trace is printed and the JVM exits with status 1.
    *
    * @param options the command-line arguments to convert into configuration lines
    */
   public static void processCommandLineOptions(String[] options) {
     File tmpFile = null;
     try {
       tmpFile = File.createTempFile("options", null, null);
       // Safety net: readConfigFile() and the catch block below can call System.exit(), which
       // bypasses the finally block, so also register the file for deletion at JVM shutdown.
       tmpFile.deleteOnExit();

       PrintWriter out = new PrintWriter(new FileWriter(tmpFile));
       try {
         for (int i = 0; i < options.length; i++) {
           String key = options[i].substring(1);
           boolean argumentLess = (i + 1 == options.length) || options[i + 1].startsWith("-");
           if (argumentLess) {
             // last item, or the next item is another flag: this is an argument-less flag
             out.println(key + "=true");
           } else {
             out.println(key + "=" + options[i + 1]);
             // skip the value we just consumed
             i++;
           }
         }
       } finally {
         // Always release the file handle, even if an option could not be processed.
         out.close();
       }

       JoshuaConfiguration.readConfigFile(tmpFile.getCanonicalPath());

     } catch (IOException e) {
       e.printStackTrace();
       System.exit(1);
     } finally {
       // Remove the temporary file whether or not the configuration was read successfully.
       if (tmpFile != null) {
         tmpFile.delete();
       }
     }
   }

   /**
    * Reads the configuration file and stores its values in the static fields of this class. This is
    * a static method instead of a constructor because all the fields are static.
    * <p>
    * Comment and empty lines are skipped. A line containing "=" is treated as a "key = value"
    * parameter; the key is passed through normalize_key() before it is compared, so hyphens,
    * underscores and case do not matter. Every other line is stored unchanged in the features list
    * for later processing.
    * <p>
    * After the file has been read, the old single-LM and single-TM parameters are converted into
    * "lm" and "tm" lines if no such lines were given, and lm_order is raised to the largest order
    * found on any "lm" line.
    * <p>
    * Malformed lines, unknown parameters and unsupported values are reported and terminate the JVM
    * with System.exit(1).
    *
    * @param configFile path of the configuration file to read
    * @throws IOException declared for failures while reading the file (raised by LineReader)
    * @throws IllegalArgumentException if the number of decoder threads is not positive
    * @throws NumberFormatException if a numeric parameter value cannot be parsed
    */
   public static void readConfigFile(String configFile) throws IOException {

     LineReader configReader = new LineReader(configFile);
     try {
       for (String line : configReader) {
         line = line.trim(); // .toLowerCase();

         if (Regex.commentOrEmptyLine.matches(line))
           continue;

         /*
          * There are two kinds of substantive (non-comment, non-blank) lines: parameters and feature
          * values. Parameters match the pattern "key = value"; all other substantive lines are
          * interpreted as features.
          */

         if (line.indexOf("=") != -1) { // parameters; (not feature function)
           String[] fds = Regex.equalsWithSpaces.split(line);
           if (fds.length != 2) {
             logger.severe("* FATAL: bad config file line '" + line + "'");
             System.exit(1);
           }

           String parameter = normalize_key(fds[0]);

           // store the line for later processing
           if (parameter.equals(normalize_key("lm"))) {
             lms.add(fds[1]);

           } else if (parameter.equals(normalize_key("tm"))) {
             tms.add(fds[1]);

           } else if (parameter.equals(normalize_key("dump-hypergraph"))) {
             hypergraphFilePattern = fds[1].trim();
             logger
                 .finest(String.format("  hypergraph dump file format: %s", hypergraphFilePattern));

           } else if (parameter.equals(normalize_key("lm_file"))) {
             lm_file = fds[1].trim();
             logger.finest(String.format("lm file: %s", lm_file));

           } else if (parameter.equals(normalize_key("parse"))) {
             parse = Boolean.parseBoolean(fds[1]);
             logger.finest(String.format("parse: %s", parse));

           } else if (parameter.equals(normalize_key("tm_file"))) {
             tm_file = fds[1].trim();
             logger.finest(String.format("tm file: %s", tm_file));

           } else if (parameter.equals(normalize_key("glue_file"))) {
             glue_file = fds[1].trim();
             logger.finest(String.format("glue file: %s", glue_file));

           } else if (parameter.equals(normalize_key("tm_format"))) {
             tm_format = fds[1].trim();
             logger.finest(String.format("tm format: %s", tm_format));

           } else if (parameter.equals(normalize_key("glue_format"))) {
             glue_format = fds[1].trim();
             logger.finest(String.format("glue format: %s", glue_format));

           } else if (parameter.equals(normalize_key("lm_type"))) {
             lm_type = fds[1];
             if (!lm_type.equals("kenlm") && !lm_type.equals("berkeleylm")
                 && !lm_type.equals("none") && !lm_type.equals("javalm")) {
               System.err.println("* FATAL: lm_type '" + lm_type + "' not supported");
               System.err
                   .println("* supported types are 'kenlm' (default), 'berkeleylm', and 'javalm' (not recommended), and 'none'");
               System.exit(1);
             }

           } else if (parameter.equals(normalize_key("lm_ceiling_cost"))) {
             lm_ceiling_cost = Double.parseDouble(fds[1]);
             logger.finest(String.format("lm_ceiling_cost: %s", lm_ceiling_cost));

           } else if (parameter.equals(normalize_key("use_left_equivalent_state"))) {
             use_left_equivalent_state = Boolean.parseBoolean(fds[1]);
             logger
                 .finest(String.format("use_left_equivalent_state: %s", use_left_equivalent_state));

           } else if (parameter.equals(normalize_key("use_right_equivalent_state"))) {
             use_right_equivalent_state = Boolean.parseBoolean(fds[1]);
             logger.finest(String.format("use_right_equivalent_state: %s",
                 use_right_equivalent_state));

           } else if (parameter.equals(normalize_key("order"))) {
             lm_order = Integer.parseInt(fds[1]);
             logger.finest(String.format("g_lm_order: %s", lm_order));

           } else if (parameter.equals(normalize_key("span_limit"))) {
             span_limit = Integer.parseInt(fds[1]);
             logger.finest(String.format("span_limit: %s", span_limit));

           } else if (parameter.equals(normalize_key("phrase_owner"))) {
             phrase_owner = fds[1].trim();
             logger.finest(String.format("phrase_owner: %s", phrase_owner));

           } else if (parameter.equals(normalize_key("glue_owner"))) {
             glue_owner = fds[1].trim();
             logger.finest(String.format("glue_owner: %s", glue_owner));

           } else if (parameter.equals(normalize_key("default_non_terminal"))) {
             // The value is always wrapped in square brackets, unlike the goal symbol below.
             default_non_terminal = "[" + fds[1].trim() + "]";
             logger.finest(String.format("default_non_terminal: %s", default_non_terminal));

           } else if (parameter.equals(normalize_key("goalSymbol"))) {
             goal_symbol = fds[1].trim();

             // If the goal symbol was not enclosed in square brackets, then add them
             if (!goal_symbol.matches("\\[.*\\]"))
               goal_symbol = "[" + goal_symbol + "]";

             logger.finest("goalSymbol: " + goal_symbol);

           } else if (parameter.equals(normalize_key("weights-file"))) {
             weights_file = fds[1];

           } else if (parameter.equals(normalize_key("constrain_parse"))) {
             constrain_parse = Boolean.parseBoolean(fds[1]);

           } else if (parameter.equals(normalize_key("true_oovs_only"))) {
             true_oovs_only = Boolean.parseBoolean(fds[1]);

           } else if (parameter.equals(normalize_key("filter-grammar"))) {
             filter_grammar = Boolean.parseBoolean(fds[1]);

           } else if (parameter.equals(normalize_key("amortize"))) {
             amortized_sorting = Boolean.parseBoolean(fds[1]);

           } else if (parameter.equals(normalize_key("use_pos_labels"))) {
             use_pos_labels = Boolean.parseBoolean(fds[1]);

           } else if (parameter.equals(normalize_key("fuzz1"))) {
             fuzz1 = Double.parseDouble(fds[1]);
             logger.finest(String.format("fuzz1: %s", fuzz1));

           } else if (parameter.equals(normalize_key("fuzz2"))) {
             fuzz2 = Double.parseDouble(fds[1]);
             logger.finest(String.format("fuzz2: %s", fuzz2));

           } else if (parameter.equals(normalize_key("max_n_items"))) {
             max_n_items = Integer.parseInt(fds[1]);
             logger.finest(String.format("max_n_items: %s", max_n_items));

           } else if (parameter.equals(normalize_key("relative_threshold"))) {
             relative_threshold = Double.parseDouble(fds[1]);
             logger.finest(String.format("relative_threshold: %s", relative_threshold));

           } else if (parameter.equals(normalize_key("max_n_rules"))) {
             max_n_rules = Integer.parseInt(fds[1]);
             logger.finest(String.format("max_n_rules: %s", max_n_rules));

           } else if (parameter.equals(normalize_key("use_unique_nbest"))) {
             use_unique_nbest = Boolean.parseBoolean(fds[1]);
             logger.finest(String.format("use_unique_nbest: %s", use_unique_nbest));

           } else if (parameter.equals(normalize_key("output-format"))) {
             outputFormat = fds[1];
             logger.finest(String.format("output-format: %s", outputFormat));

           } else if (parameter.equals(normalize_key("escape_trees"))) {
             escape_trees = Boolean.parseBoolean(fds[1]);
             logger.finest(String.format("escape_trees: %s", escape_trees));

           } else if (parameter.equals(normalize_key("include_align_index"))) {
             include_align_index = Boolean.parseBoolean(fds[1]);
             logger.finest(String.format("include_align_index: %s", include_align_index));

           } else if (parameter.equals(normalize_key("top_n"))) {
             topN = Integer.parseInt(fds[1]);
             logger.finest(String.format("topN: %s", topN));

           } else if (parameter.equals(normalize_key("num_parallel_decoders"))
               || parameter.equals(normalize_key("threads"))) {
             num_parallel_decoders = Integer.parseInt(fds[1]);
             if (num_parallel_decoders <= 0) {
               throw new IllegalArgumentException(
                   "Must specify a positive number for num_parallel_decoders");
             }
             logger.finest(String.format("num_parallel_decoders: %s", num_parallel_decoders));

           } else if (parameter.equals(normalize_key("visualize_hypergraph"))) {
             visualize_hypergraph = Boolean.parseBoolean(fds[1]);
             logger.finest(String.format("visualize_hypergraph: %s", visualize_hypergraph));

           } else if (parameter.equals(normalize_key("mark_oovs"))) {
             mark_oovs = Boolean.parseBoolean(fds[1]);
             logger.finest(String.format("mark_oovs: %s", mark_oovs));

           } else if (parameter.equals(normalize_key("pop-limit"))) {
             pop_limit = Integer.parseInt(fds[1]);
             logger.finest(String.format("pop-limit: %s", pop_limit));

           } else if (parameter.equals(normalize_key("useCubePrune"))) {
             useCubePrune = Boolean.parseBoolean(fds[1]);
             if (!useCubePrune)
               logger.warning("useCubePrune=false");
             logger.finest(String.format("useCubePrune: %s", useCubePrune));

           } else if (parameter.equals(normalize_key("useBeamAndThresholdPrune"))) {
             useBeamAndThresholdPrune = Boolean.parseBoolean(fds[1]);
             if (!useBeamAndThresholdPrune)
               logger.warning("useBeamAndThresholdPrune=false");
             logger.finest(String.format("useBeamAndThresholdPrune: %s", useBeamAndThresholdPrune));

           } else if (parameter.equals(normalize_key("useGoogleLinearCorpusGain"))) {
             useGoogleLinearCorpusGain = Boolean.parseBoolean(fds[1].trim());
             logger
                 .finest(String.format("useGoogleLinearCorpusGain: %s", useGoogleLinearCorpusGain));

           } else if (parameter.equals(normalize_key("googleBLEUWeights"))) {
             // Exactly five ';'-separated weights are required.
             String[] googleWeights = fds[1].trim().split(";");
             if (googleWeights.length != 5) {
               logger.severe("wrong line=" + line);
               System.exit(1);
             }
             linearCorpusGainThetas = new double[5];
             for (int i = 0; i < 5; i++)
               linearCorpusGainThetas[i] = Double.parseDouble(googleWeights[i]);

             // Format the contents; passing the array itself to %s would only print its identity.
             logger.finest(String.format("googleBLEUWeights: %s",
                 java.util.Arrays.toString(linearCorpusGainThetas)));

           } else if (parameter.equals(normalize_key("oracleFile"))) {
             oracleFile = fds[1].trim();
             logger.info(String.format("    oracle file: %s", oracleFile));
             if (!new File(oracleFile).exists()) {
               logger.severe("FATAL: can't find oracle file '" + oracleFile + "'");
               System.exit(1);
             }

           } else if (parameter.equals(normalize_key("server-port"))) {
             server_port = Integer.parseInt(fds[1]);
             logger.info(String.format("    server-port: %d", server_port));

           } else if (parameter.equals("c") || parameter.equals("config")) {
             // this was used to send in the config file, just ignore it
             ;

           } else if (parameter.equals(normalize_key("feature-function"))) {
             // add the feature to the list of features for later processing
             features.add("feature_function = " + fds[1]);

           } else if (parameter.equals(normalize_key("maxlen"))) {
             // maximum sentence length
             maxlen = Integer.parseInt(fds[1]);

           } else {

             if (parameter.equals(normalize_key("use-sent-specific-tm"))
                 || parameter.equals(normalize_key("add-combined-cost"))
                 || parameter.equals(normalize_key("use-tree-nbest"))
                 || parameter.equals(normalize_key("use-kenlm"))
                 || parameter.equals(normalize_key("regexp-grammar"))) {
               logger.warning(String.format("WARNING: ignoring deprecated parameter '%s'", fds[0]));

             } else {
               logger.severe("FATAL: unknown configuration parameter '" + fds[0] + "'");
               System.exit(1);
             }
           }

           logger.info(String.format("    %s = '%s'", parameter, fds[1]));

         } else {
           // Feature function. These are processed a bit later
           // in JoshuaDecoder initialization, so we just set
           // them aside for now.

           features.add(line);
         }
       }
     } finally {
       configReader.close();
     }

     // This is for backwards compatibility of LM format. If the
     // config file did not contain lines of the form "lm = ...",
     // then we create one from the handful of separately-specified
     // parameters. These combined lines are later processed in
     // JoshuaDecoder as part of the multiple LM support
     if (lms.size() == 0 && lm_file != null) {
       String line = String.format("%s %d %b %b %.2f %s", lm_type, lm_order,
           use_left_equivalent_state, use_right_equivalent_state, lm_ceiling_cost, lm_file);
       lms.add(line);
     }

     // Language model orders are particular to each LM, but for
     // purposes of state maintenance, we set the global value to
     // the maximum of any of the individual models
     for (String lmLine : lms) {
       String[] tokens = lmLine.split("\\s+");
       // The order is the second whitespace-separated field; report a clear error if it is missing.
       if (tokens.length < 2) {
         logger.severe("* FATAL: bad lm line '" + lmLine + "'");
         System.exit(1);
       }
       int order = Integer.parseInt(tokens[1]);
       if (order > JoshuaConfiguration.lm_order)
         JoshuaConfiguration.lm_order = order;
     }

     /*
      * Now we do a similar thing for the TMs, enabling backward compatibility with the old format
      * that allowed for just two grammars. The new format is
      *
      * tm = FORMAT OWNER SPAN_LIMIT FILE
      */
     if (tms.size() == 0 && tm_file != null) {
       tms.add(String.format("%s %s %d %s", tm_format, phrase_owner, span_limit, tm_file));
       // NOTE(review): glue_file may still be null here if only tm_file was configured — confirm
       // how the consumer of this line handles that.
       tms.add(String.format("%s %s %d %s", glue_format, glue_owner, -1, glue_file));
     }

     if (useGoogleLinearCorpusGain) {
       if (linearCorpusGainThetas == null) {
         logger.severe("linearCorpusGainThetas is null, did you set googleBLEUWeights properly?");
         System.exit(1);
       } else if (linearCorpusGainThetas.length != 5) {
         logger
             .severe("linearCorpusGainThetas does not have five values, did you set googleBLEUWeights properly?");
         System.exit(1);
       }
     }
   }

   /**
    * Checks for invalid combinations of configuration values.
    * <p>
    * A positive pop limit cannot be combined with beam-and-threshold pruning. If both are set, an
    * error is printed to standard error and the JVM exits with a non-zero status so that callers
    * (e.g. shell scripts) can detect the failure.
    */
   public static void sanityCheck() {
     if (pop_limit > 0 && useBeamAndThresholdPrune) {
       System.err
           .println("* FATAL: 'pop-limit' > 0 is incompatible with 'useBeamAndThresholdPrune'");
       System.exit(1);
     }
   }

   /**
    * Normalizes parameter names by removing underscores and hyphens and lowercasing. This defines
    * equivalence classes on external use of parameter names, permitting arbitrary_under_scores and
    * camelCasing in parameter names without forcing the user to memorize them all. Here are some
    * examples of equivalent ways to refer to parameter names:
    *
    * {pop-limit, poplimit, PopLimit, popLimit, pop_lim_it} {lmfile, lm-file, LM-FILE, lm_file}
    * <p>
    * Lowercasing uses the root locale so that the result does not depend on the JVM's default
    * locale (for example, under a Turkish locale "I" would otherwise become a dotless "ı").
    *
    * @param text the parameter name as written by the user
    * @return the name with all '-' and '_' characters removed, in lower case
    */
   public static String normalize_key(String text) {
     return text.replaceAll("[-_]", "").toLowerCase(java.util.Locale.ROOT);
   }
 }
	package joshua.decoder;

	import java.io.File;
	import java.io.FileWriter;
	import java.io.IOException;
	import java.io.PrintWriter;
	import java.util.ArrayList;
	import java.util.logging.Logger;

	import joshua.util.Regex;
	import joshua.util.io.LineReader;

	/**
	* Configuration file for Joshua decoder.
	* <p>
	* When adding new features to Joshua, any new configurable parameters should be added to this
	* class.
	*
	* @author Zhifei Li, <zhifei.work@gmail.com>
	* @author Matt Post <post@cs.jhu.edu>
	*/
	public class JoshuaConfiguration {
	// lm config
	// new format enabling multiple language models
	public static ArrayList<String> lms = new ArrayList<String>();

	// new format enabling any number of grammar files
	public static ArrayList<String> tms = new ArrayList<String>();

	// old format specifying attributes of a single language model separately
	public static String lm_type = "kenlm";
	public static double lm_ceiling_cost = 100;
	public static boolean use_left_equivalent_state = false;
	public static boolean use_right_equivalent_state = false;
	public static int lm_order = 3;

	public static String lm_file = null;

	/*
	* The file to read the weights from (part of the sparse features implementation).
	*/
	public static String weights_file = "";

	/*
	* The span limit is the maximum span of the input to which rules from the main translation
	* grammar can be applied. It does not apply to the glue grammar.
	*/
	public static int span_limit = 20;

	/*
	* The owner is an index into a grammar's feature sets. The name here ties together the features
	* present on each grammar line in a grammar file and the features present in the Joshua
	* configuration file. This allows you to have different (or shared) sets of features across
	* grammar files.
	*/
	public static String phrase_owner = "pt";
	public static String glue_owner = "glue";

	// Default symbols. The symbol here should be enclosed in square brackets.
	public static String default_non_terminal = "[X]";
	public static String goal_symbol = "[GOAL]";

	public static boolean dense_features = true;

	/* If false, sorting of the complete grammar is done at load time. If true, grammar tries are not
	* sorted till they are first accessed. */
	public static boolean amortized_sorting = true;

	public static String tm_file = null;
	public static String tm_format = "thrax";

	// TODO: support multiple glue grammars
	public static String glue_file = null;
	public static String glue_format = "thrax";

	// syntax-constrained decoding
	public static boolean constrain_parse = false;
	public static boolean use_pos_labels = false;

	// oov-specific
	public static boolean true_oovs_only = false;

	/* Sentence-level filtering. */
	public static boolean filter_grammar = false;

	// pruning config

	// Cube pruning is always on, with a span-level pop limit of 100.
	// Beam and threshold pruning can be enabled, which also changes
	// the nature of cube pruning so that the pop limit is no longer
	// used. If both are turned off, exhaustive pruning takes effect.
	public static int pop_limit = 100;
	public static boolean useCubePrune = true;
	public static boolean useBeamAndThresholdPrune = false;
	public static double fuzz1 = 0.1;
	public static double fuzz2 = 0.1;
	public static int max_n_items = 30;
	public static double relative_threshold = 10.0;
	public static int max_n_rules = 50;

	/* Maximum sentence length */
	public static int maxlen = 200;

	/*
	* N-best configuration.
	*/
	// make sure output strings are unique
	public static boolean use_unique_nbest = false;
	// include the phrasal alignments in the output
	public static boolean include_align_index = false;
	// The number of hypotheses to output by default
	public static int topN = 1;

	/*
	* This string describes the format of each line of output from the decoder (i.e., the
	* translations). The string can include arbitrary text and also variables. The following variables
	* are available:
	*
	* %i the 0-index sentence number
	* %s the translated sentence
	* %S the translated sentence, denormalized
	* %t the synchronous derivation
	* %f the list of feature values (as name=value pairs)
	* %c the model cost
	* %w the weight vector
	* %a the alignments between source and target words (currently unimplemented)
	*/
	public static String outputFormat = "%i \|\|\| %s \|\|\| %f \|\|\| %c";

	public static boolean escape_trees = false;

	public static int num_parallel_decoders = 1; // number of threads should run

	// disk hg
	public static String hypergraphFilePattern = "";

	// hypergraph visualization
	public static boolean visualize_hypergraph = false;

	// use google linear corpus gain?
	public static boolean useGoogleLinearCorpusGain = false;
	public static double[] linearCorpusGainThetas = null;
	public static boolean mark_oovs = true;

	// used to extract oracle hypotheses from the forest
	public static String oracleFile = null;

	public static boolean parse = false; // perform synchronous parsing

	private static final Logger logger = Logger.getLogger(JoshuaConfiguration.class.getName());

	public static ArrayList<String> features = new ArrayList<String>();

	/* If set, Joshua will start a (multi-threaded, per "threads") TCP/IP server on this port. */
	public static int server_port = 0;

	// ===============================================================
	// Methods
	// ===============================================================

	/**
	* To process command-line options, we write them to a file that looks like the config file, and
	* then call readConfigFile() on it. It would be more general to define a class that sits on a
	* stream and knows how to chop it up, but this was quicker to implement.
	*/
	public static void processCommandLineOptions(String[] options) {
	try {
	File tmpFile = File.createTempFile("options", null, null);
	PrintWriter out = new PrintWriter(new FileWriter(tmpFile));

	for (int i = 0; i < options.length; i++) {
	String key = options[i].substring(1);
	if (i + 1 == options.length \|\| options[i + 1].startsWith("-")) {
	// if this is the last item, or if the next item
	// is another flag, then this is an argument-less
	// flag
	out.println(key + "=true");

	} else {
	out.println(key + "=" + options[i + 1]);
	// skip the next item
	i++;
	}
	}
	out.close();
	JoshuaConfiguration.readConfigFile(tmpFile.getCanonicalPath());

	tmpFile.delete();

	} catch (IOException e) {
	e.printStackTrace();
	System.exit(1);
	}
	}

	// This is static instead of a constructor because all the fields
	// are static.
	public static void readConfigFile(String configFile) throws IOException {

	LineReader configReader = new LineReader(configFile);
	try {
	for (String line : configReader) {
	line = line.trim(); // .toLowerCase();

	if (Regex.commentOrEmptyLine.matches(line))
	continue;

	/*
	* There are two kinds of substantive (non-comment, non-blank) lines: parameters and feature
	* values. Parameters match the pattern "key = value"; all other substantive lines are
	* interpreted as features.
	*/

	if (line.indexOf("=") != -1) { // parameters; (not feature function)
	String[] fds = Regex.equalsWithSpaces.split(line);
	if (fds.length != 2) {
	logger.severe("* FATAL: bad config file line '" + line + "'");
	System.exit(1);
	}

	String parameter = normalize_key(fds[0]);

	// store the line for later processing
	if (parameter.equals(normalize_key("lm"))) {
	lms.add(fds[1]);

	} else if (parameter.equals(normalize_key("tm"))) {
	tms.add(fds[1]);

	} else if (parameter.equals(normalize_key("dump-hypergraph"))) {
	hypergraphFilePattern = fds[1].trim();
	logger
	.finest(String.format(" hypergraph dump file format: %s", hypergraphFilePattern));

	} else if (parameter.equals(normalize_key("lm_file"))) {
	lm_file = fds[1].trim();
	logger.finest(String.format("lm file: %s", lm_file));
	} else if (parameter.equals(normalize_key("parse"))) {
	parse = Boolean.parseBoolean(fds[1]);
	logger.finest(String.format("parse: %s", parse));

	} else if (parameter.equals(normalize_key("tm_file"))) {
	tm_file = fds[1].trim();
	logger.finest(String.format("tm file: %s", tm_file));

	} else if (parameter.equals(normalize_key("glue_file"))) {
	glue_file = fds[1].trim();
	logger.finest(String.format("glue file: %s", glue_file));

	} else if (parameter.equals(normalize_key("tm_format"))) {
	tm_format = fds[1].trim();
	logger.finest(String.format("tm format: %s", tm_format));

	} else if (parameter.equals(normalize_key("glue_format"))) {
	glue_format = fds[1].trim();
	logger.finest(String.format("glue format: %s", glue_format));

	} else if (parameter.equals(normalize_key("dump-hypergraph"))) {
	hypergraphFilePattern = fds[1].trim();
	logger
	.finest(String.format(" hypergraph dump file format: %s", hypergraphFilePattern));

	} else if (parameter.equals(normalize_key("lm_type"))) {
	lm_type = String.valueOf(fds[1]);
	if (!lm_type.equals("kenlm") && !lm_type.equals("berkeleylm")
	&& !lm_type.equals("none") && !lm_type.equals("javalm")) {
	System.err.println("* FATAL: lm_type '" + lm_type + "' not supported");
	System.err
	.println("* supported types are 'kenlm' (default), 'berkeleylm', and 'javalm' (not recommended), and 'none'");
	System.exit(1);
	}

	} else if (parameter.equals(normalize_key("lm_ceiling_cost"))) {
	lm_ceiling_cost = Double.parseDouble(fds[1]);
	logger.finest(String.format("lm_ceiling_cost: %s", lm_ceiling_cost));

	} else if (parameter.equals(normalize_key("use_left_equivalent_state"))) {
	use_left_equivalent_state = Boolean.valueOf(fds[1]);
	logger
	.finest(String.format("use_left_equivalent_state: %s", use_left_equivalent_state));

	} else if (parameter.equals(normalize_key("use_right_equivalent_state"))) {
	use_right_equivalent_state = Boolean.valueOf(fds[1]);
	logger.finest(String.format("use_right_equivalent_state: %s",
	use_right_equivalent_state));

	} else if (parameter.equals(normalize_key("order"))) {
	lm_order = Integer.parseInt(fds[1]);
	logger.finest(String.format("g_lm_order: %s", lm_order));

	} else if (parameter.equals(normalize_key("span_limit"))) {
	span_limit = Integer.parseInt(fds[1]);
	logger.finest(String.format("span_limit: %s", span_limit));

	} else if (parameter.equals(normalize_key("phrase_owner"))) {
	phrase_owner = fds[1].trim();
	logger.finest(String.format("phrase_owner: %s", phrase_owner));

	} else if (parameter.equals(normalize_key("glue_owner"))) {
	glue_owner = fds[1].trim();
	logger.finest(String.format("glue_owner: %s", glue_owner));

	} else if (parameter.equals(normalize_key("default_non_terminal"))) {
	default_non_terminal = "[" + fds[1].trim() + "]";
	// default_non_terminal = fds[1].trim();
	logger.finest(String.format("default_non_terminal: %s", default_non_terminal));

	} else if (parameter.equals(normalize_key("goalSymbol"))) {
	goal_symbol = fds[1].trim();

	// If the goal symbol was not enclosed in square brackets, then add them
	if (!goal_symbol.matches("\\[.*\\]"))
	goal_symbol = "[" + goal_symbol + "]";

	logger.finest("goalSymbol: " + goal_symbol);

	} else if (parameter.equals(normalize_key("weights-file"))) {
	weights_file = fds[1];

	} else if (parameter.equals(normalize_key("constrain_parse"))) {
	constrain_parse = Boolean.parseBoolean(fds[1]);

	} else if (parameter.equals(normalize_key("true_oovs_only"))) {
	true_oovs_only = Boolean.parseBoolean(fds[1]);

	} else if (parameter.equals(normalize_key("filter-grammar"))) {
	filter_grammar = Boolean.parseBoolean(fds[1]);

	} else if (parameter.equals(normalize_key("amortize"))) {
	amortized_sorting = Boolean.parseBoolean(fds[1]);

	} else if (parameter.equals(normalize_key("use_pos_labels"))) {
	use_pos_labels = Boolean.parseBoolean(fds[1]);

	} else if (parameter.equals(normalize_key("fuzz1"))) {
	fuzz1 = Double.parseDouble(fds[1]);
	logger.finest(String.format("fuzz1: %s", fuzz1));

	} else if (parameter.equals(normalize_key("fuzz2"))) {
	fuzz2 = Double.parseDouble(fds[1]);
	logger.finest(String.format("fuzz2: %s", fuzz2));

	} else if (parameter.equals(normalize_key("max_n_items"))) {
	max_n_items = Integer.parseInt(fds[1]);
	logger.finest(String.format("max_n_items: %s", max_n_items));

	} else if (parameter.equals(normalize_key("relative_threshold"))) {
	relative_threshold = Double.parseDouble(fds[1]);
	logger.finest(String.format("relative_threshold: %s", relative_threshold));

	} else if (parameter.equals(normalize_key("max_n_rules"))) {
	max_n_rules = Integer.parseInt(fds[1]);
	logger.finest(String.format("max_n_rules: %s", max_n_rules));

	} else if (parameter.equals(normalize_key("use_unique_nbest"))) {
	use_unique_nbest = Boolean.valueOf(fds[1]);
	logger.finest(String.format("use_unique_nbest: %s", use_unique_nbest));

	} else if (parameter.equals(normalize_key("output-format"))) {
	outputFormat = fds[1];
	logger.finest(String.format("output-format: %s", outputFormat));

	} else if (parameter.equals(normalize_key("escape_trees"))) {
	escape_trees = Boolean.valueOf(fds[1]);
	logger.finest(String.format("escape_trees: %s", escape_trees));

	} else if (parameter.equals(normalize_key("include_align_index"))) {
	include_align_index = Boolean.valueOf(fds[1]);
	logger.finest(String.format("include_align_index: %s", include_align_index));

	} else if (parameter.equals(normalize_key("top_n"))) {
	topN = Integer.parseInt(fds[1]);
	logger.finest(String.format("topN: %s", topN));

	} else if (parameter.equals(normalize_key("num_parallel_decoders"))
	\|\| parameter.equals(normalize_key("threads"))) {
	num_parallel_decoders = Integer.parseInt(fds[1]);
	if (num_parallel_decoders <= 0) {
	throw new IllegalArgumentException(
	"Must specify a positive number for num_parallel_decoders");
	}
	logger.finest(String.format("num_parallel_decoders: %s", num_parallel_decoders));

	} else if (parameter.equals(normalize_key("visualize_hypergraph"))) {
	visualize_hypergraph = Boolean.valueOf(fds[1]);
	logger.finest(String.format("visualize_hypergraph: %s", visualize_hypergraph));

	} else if (parameter.equals(normalize_key("mark_oovs"))) {
	mark_oovs = Boolean.valueOf(fds[1]);
	logger.finest(String.format("mark_oovs: %s", mark_oovs));

	} else if (parameter.equals(normalize_key("pop-limit"))) {
	pop_limit = Integer.valueOf(fds[1]);
	logger.finest(String.format("pop-limit: %s", pop_limit));

	} else if (parameter.equals(normalize_key("useCubePrune"))) {
	useCubePrune = Boolean.valueOf(fds[1]);
	if (useCubePrune == false)
	logger.warning("useCubePrune=false");
	logger.finest(String.format("useCubePrune: %s", useCubePrune));

	} else if (parameter.equals(normalize_key("useBeamAndThresholdPrune"))) {
	useBeamAndThresholdPrune = Boolean.valueOf(fds[1]);
	if (useBeamAndThresholdPrune == false)
	logger.warning("useBeamAndThresholdPrune=false");
	logger.finest(String.format("useBeamAndThresholdPrune: %s", useBeamAndThresholdPrune));

	} else if (parameter.equals(normalize_key("useGoogleLinearCorpusGain"))) {
	useGoogleLinearCorpusGain = new Boolean(fds[1].trim());
	logger
	.finest(String.format("useGoogleLinearCorpusGain: %s", useGoogleLinearCorpusGain));

	} else if (parameter.equals(normalize_key("googleBLEUWeights"))) {
	String[] googleWeights = fds[1].trim().split(";");
	if (googleWeights.length != 5) {
	logger.severe("wrong line=" + line);
	System.exit(1);
	}
	linearCorpusGainThetas = new double[5];
	for (int i = 0; i < 5; i++)
	linearCorpusGainThetas[i] = new Double(googleWeights[i]);

	logger.finest(String.format("googleBLEUWeights: %s", linearCorpusGainThetas));

	} else if (parameter.equals(normalize_key("oracleFile"))) {
	oracleFile = fds[1].trim();
	logger.info(String.format(" oracle file: %s", oracleFile));
	if (!new File(oracleFile).exists()) {
	logger.warning("FATAL: can't find oracle file '" + oracleFile + "'");
	System.exit(1);
	}

	} else if (parameter.equals(normalize_key("server-port"))) {
	server_port = Integer.parseInt(fds[1]);
	logger.info(String.format(" server-port: %d", server_port));

	} else if (parameter.equals("c") \|\| parameter.equals("config")) {
	// this was used to send in the config file, just ignore it
	;

	} else if (parameter.equals(normalize_key("feature-function"))) {
	// add the feature to the list of features for later processing
	features.add("feature_function = " + fds[1]);

	} else if (parameter.equals(normalize_key("maxlen"))) {
	// add the feature to the list of features for later processing
	maxlen = Integer.parseInt(fds[1]);

	} else {

	if (parameter.equals(normalize_key("use-sent-specific-tm"))
	\|\| parameter.equals(normalize_key("add-combined-cost"))
	\|\| parameter.equals(normalize_key("use-tree-nbest"))
	\|\| parameter.equals(normalize_key("use-kenlm"))
	\|\| parameter.equals(normalize_key("regexp-grammar"))) {
	logger.warning(String.format("WARNING: ignoring deprecated parameter '%s'", fds[0]));

	} else {
	logger.warning("FATAL: unknown configuration parameter '" + fds[0] + "'");
	System.exit(1);
	}
	}

	logger.info(String.format(" %s = '%s'", normalize_key(fds[0]), fds[1]));

	} else {
	// Feature function. These are processed a bit later
	// in JoshuaDecoder initialization, so we just set
	// them aside for now.

	features.add(line);
	}
	}
	} finally {
	configReader.close();
	}

	// This is for backwards compatibility of LM format. If the
	// config file did not contain lines of the form "lm = ...",
	// then we create one from the handful of separately-specified
	// parameters. These combined lines are later processed in
	// JoshuaDecoder as part of the multiple LM support
	if (lms.size() == 0 && lm_file != null) {
	String line = String.format("%s %d %b %b %.2f %s", lm_type, lm_order,
	use_left_equivalent_state, use_right_equivalent_state, lm_ceiling_cost, lm_file);
	lms.add(line);
	}

	// Language model orders are particular to each LM, but for
	// purposes of state maintenance, we set the global value to
	// the maximum of any of the individual models
	for (String lmLine : lms) {
	String tokens[] = lmLine.split("\\s+");
	int order = Integer.parseInt(tokens[1]);
	if (order > JoshuaConfiguration.lm_order)
	JoshuaConfiguration.lm_order = order;
	}

	/*
	* Now we do a similar thing for the TMs, enabling backward compatibility with the old format
	* that allowed for just two grammars. The new format is
	*
	* tm = FORMAT OWNER SPAN_LIMIT FILE
	*/
	if (tms.size() == 0 && tm_file != null) {
	tms.add(String.format("%s %s %d %s", tm_format, phrase_owner, span_limit, tm_file));
	tms.add(String.format("%s %s %d %s", glue_format, glue_owner, -1, glue_file));
	}

	if (useGoogleLinearCorpusGain) {
	if (linearCorpusGainThetas == null) {
	logger.info("linearCorpusGainThetas is null, did you set googleBLEUWeights properly?");
	System.exit(1);
	} else if (linearCorpusGainThetas.length != 5) {
	logger
	.info("linearCorpusGainThetas does not have five values, did you set googleBLEUWeights properly?");
	System.exit(1);
	}
	}
	}

	/**
	 * Checks for invalid combinations of configuration values.
	 * <p>
	 * The only combination currently checked is a positive {@code pop_limit} together with
	 * {@code useBeamAndThresholdPrune}. When both are set, a message is printed to standard error
	 * and the JVM is terminated with a non-zero exit status.
	 */
	public static void sanityCheck() {
	  if (pop_limit > 0 && useBeamAndThresholdPrune) {
	    // The message states the condition actually tested (strictly positive pop limit).
	    System.err
	        .println("* FATAL: 'pop-limit' > 0 is incompatible with 'useBeamAndThresholdPrune'");
	    // Exit with a failure status so calling scripts can detect the fatal configuration error.
	    System.exit(1);
	  }
	}

	/**
	 * Normalizes parameter names by removing underscores and hyphens and lowercasing. This defines
	 * equivalence classes on external use of parameter names, permitting arbitrary_under_scores and
	 * camelCasing in parameter names without forcing the user to memorize them all. Here are some
	 * examples of equivalent ways to refer to parameter names:
	 *
	 * {pop-limit, poplimit, PopLimit, popLimit, pop_lim_it} {lmfile, lm-file, LM-FILE, lm_file}
	 * <p>
	 * Lowercasing is done with {@link java.util.Locale#ROOT} so that the result does not depend on
	 * the JVM's default locale (for example, under a Turkish locale a default-locale lowercase of
	 * "LM-FILE" would produce a dotless i and fail to match "lmfile").
	 *
	 * @param text the parameter name to normalize
	 * @return the name with every '-' and '_' removed, converted to lower case
	 */
	public static String normalize_key(String text) {
	  String stripped = text.replaceAll("[-_]", "");
	  return stripped.toLowerCase(java.util.Locale.ROOT);
	}
	}