src/joshua/decoder/JoshuaConfiguration.java - joshua - Git at Google

 package joshua.decoder;

 import java.io.File;
 import java.io.FileWriter;
 import java.io.IOException;
 import java.io.PrintWriter;
 import java.io.BufferedReader;
 import java.io.FileReader;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.logging.Logger;

 import joshua.decoder.ff.StatefulFF;
 import joshua.decoder.ff.fragmentlm.Tree;
 import joshua.util.FormatUtils;
 import joshua.util.Regex;
 import joshua.util.io.LineReader;

 /**
  * Configuration file for Joshua decoder.
  *
  * When adding new features to Joshua, any new configurable parameters should be added to this
  * class.
  *
  * @author Zhifei Li, <zhifei.work@gmail.com>
  * @author Matt Post <post@cs.jhu.edu>
  */
 public class JoshuaConfiguration {

   // List of grammar files to read
   public ArrayList<String> tms = new ArrayList<String>();

   /*
    * The file to read the weights from (part of the sparse features implementation). Weights can
    * also just be listed in the main config file.
    */
   public String weights_file = "";

   // Default symbols. The symbol here should be enclosed in square brackets.
   public String default_non_terminal = FormatUtils.markup("X");
   public String goal_symbol = FormatUtils.markup("GOAL");

   /*
    * A list of OOV symbols in the form
    *
    * [X1] weight [X2] weight [X3] weight ...
    *
    * where the [X] symbols are nonterminals and the weights are weights. For each OOV word w in the
    * input sentence, Joshua will create rules of the form
    *
    * X1 -> w (weight)
    *
    * If this is empty, an unweighted default_non_terminal is used.
    */

   /**
    * A nonterminal label together with its weight. The natural ordering places items with larger
    * weights first, so sorting a list of items yields descending weight order.
    */
   public class OOVItem implements Comparable<OOVItem> {
     public String label;
     public float weight;

     OOVItem(String label, float weight) {
       this.label = label;
       this.weight = weight;
     }

     /**
      * Compares by weight, heaviest first.
      *
      * @param other the item to compare against
      * @return -1 if this weight is greater, 1 if it is smaller, and 0 otherwise (which includes
      *         the case where either weight is NaN)
      */
     @Override
     public int compareTo(OOVItem other) {
       // Explicit comparisons (rather than Float.compare) so that NaN compares as "equal".
       return (weight > other.weight) ? -1 : ((weight < other.weight) ? 1 : 0);
     }
   }
   public ArrayList<OOVItem> oovList = null;

   /*
    * Whether to segment OOVs into a lattice
    */
   public boolean segment_oovs = false;

   /*
    * Enable lattice decoding.
    */
   public boolean lattice_decoding = false;

   /*
    * If false, sorting of the complete grammar is done at load time. If true, grammar tries are not
    * sorted till they are first accessed. Amortized sorting means you get your first translation
    * much, much quicker (good for debugging), but that per-sentence decoding is a bit slower.
    */
   public boolean amortized_sorting = true;

   // syntax-constrained decoding
   public boolean constrain_parse = false;
   public boolean use_pos_labels = false;

   // oov-specific
   public boolean true_oovs_only = false;

   /* Dynamic sentence-level filtering. */
   public boolean filter_grammar = false;

   /* The cube pruning pop limit. Set to 0 for exhaustive pruning. */
   public int pop_limit = 100;

   /* Maximum sentence length. Sentences longer than this are truncated. */
   public int maxlen = 200;

   /*
    * N-best configuration.
    */
   // Make sure output strings in the n-best list are unique.
   public boolean use_unique_nbest = true;

   /* Include the phrasal alignments in the output (not word-level alignments at the moment). */
   public boolean include_align_index = false;

   /* The number of hypotheses to output by default. */
   public int topN = 1;

   /**
    * This string describes the format of each line of output from the decoder (i.e., the
    * translations). The string can include arbitrary text and also variables. The following
    * variables are available:
    *
    * <pre>
    * - %i the 0-indexed sentence number
    * - %e the source string
    * - %s the translated sentence
    * - %S the translated sentence with some basic capitalization and denormalization
    * - %t the synchronous derivation
    * - %f the list of feature values (as name=value pairs)
    * - %c the model cost
    * - %w the weight vector
    * - %a the alignments between source and target words (currently unimplemented)
    * - %d a verbose, many-line version of the derivation
    * </pre>
    */
   public String outputFormat = "%i ||| %s ||| %f ||| %c";

   /* The number of decoding threads to use (-threads). */
   public int num_parallel_decoders = 1;

   // disk hg
   public String hypergraphFilePattern = "";

   /*
    * When true, _OOV is appended to all words that are passed through (useful for something like
    * transliteration on the target side).
    */
   public boolean mark_oovs = false;

   /* Enables synchronous parsing. */
   public boolean parse = false; // perform synchronous parsing

   private final Logger logger = Logger.getLogger(JoshuaConfiguration.class.getName());

   /* A list of the feature functions. */
   public ArrayList<String> features = new ArrayList<String>();

   /* A list of weights found in the main config file (instead of in a separate weights file) */
   public ArrayList<String> weights = new ArrayList<String>();

   /* If set, Joshua will start a (multi-threaded, per "threads") TCP/IP server on this port. */
   public int server_port = 0;

   /*
    * Whether to do forest rescoring. If set to true, the references are expected on STDIN along with
    * the input sentences in the following format:
    *
    * input sentence ||| ||| reference1 ||| reference2 ...
    *
    * (The second field is reserved for the output sentence for alignment and forced decoding).
    */

   public boolean rescoreForest = false;
   public float rescoreForestWeight = 10.0f;

   /*
    * Location of fragment mapping file, which maps flattened SCFG rules to their internal
    * representation.
    */
   public String fragmentMapFile = null;

   /*
    * Whether to use soft syntactic constraint decoding /fuzzy matching, which allows that any
    * nonterminal may be substituted for any other nonterminal (except for OOV and GOAL)
    */
   public boolean fuzzy_matching = false;

   public static final String SOFT_SYNTACTIC_CONSTRAINT_DECODING_PROPERTY_NAME = "fuzzy_matching";

   /***
    * Phrase-based decoding parameters.
    */

   /* The search algorithm: currently either "cky" or "stack" */
   public String search_algorithm = "cky";

   /* The distortion limit */
   public int reordering_limit = 8;

   /* The number of target sides considered for each source side (after sorting by model weight) */
   public int num_translation_options = 20;

   /* If true, decode using a dot chart (standard CKY+); if false, use the much more efficient
    * version of Sennrich (SSST 2014)
    */
   public boolean use_dot_chart = true;

   /* Moses compatibility */
   public boolean moses = false;

   /* If true, just print out the weights found in the config file, and exit. */
   public boolean show_weights_and_quit = false;

   /* Read input from a file (Moses compatible flag) */
   public String input_file = null;

   /* Write n-best output to this file */
   public String n_best_file = null;

   /* Whether to look at source side for special annotations */
   public boolean source_annotations = false;

   /* Weights overridden from the command line */
   public String weight_overwrite = "";

   /**
    * Restores this configuration to its default values, i.e. the values assigned by the field
    * initializers of this class. This is useful when the same JoshuaConfiguration object is used for
    * several calls to the decoder within one Java program, which otherwise leads to potential errors
    * due to inconsistent state as a result of loading the configuration multiple times.
    *
    * Two things differ from a freshly constructed object: the global state index of StatefulFF is
    * reset as a side effect, and oovList is set to a one-element list holding the default
    * nonterminal with weight 1.0 rather than to null.
    *
    * State held outside this object (for example Decoder.VERBOSE, or whatever Tree.readMapping()
    * loaded) is not touched by this method.
    */
   public void reset() {
     logger.info("Resetting the JoshuaConfiguration to its defaults ...");
     logger.info("\n\tResetting the StatefullFF global state index ...");
     StatefulFF.resetGlobalStateIndex();
     logger.info("\n\t...done");

     // Grammars, weights, and symbols
     tms = new ArrayList<String>();
     weights_file = "";
     default_non_terminal = "[X]";
     goal_symbol = "[GOAL]";
     oovList = new ArrayList<OOVItem>();
     oovList.add(new OOVItem(default_non_terminal, 1.0f));
     features = new ArrayList<String>();
     weights = new ArrayList<String>();
     weight_overwrite = "";

     // Decoding behaviour
     segment_oovs = false;
     lattice_decoding = false;
     amortized_sorting = true;
     constrain_parse = false;
     use_pos_labels = false;
     true_oovs_only = false;
     filter_grammar = false;
     pop_limit = 100;
     maxlen = 200;
     mark_oovs = false;
     parse = false; // perform synchronous parsing
     fuzzy_matching = false;
     fragmentMapFile = null;
     rescoreForest = false;
     rescoreForestWeight = 10.0f;

     // Output
     use_unique_nbest = true; // matches the field initializer (was previously reset to false)
     include_align_index = false;
     topN = 1;
     outputFormat = "%i ||| %s ||| %f ||| %c";
     hypergraphFilePattern = "";
     n_best_file = null;

     // Runtime
     num_parallel_decoders = 1;
     server_port = 0;
     input_file = null;
     show_weights_and_quit = false;
     moses = false;
     source_annotations = false;

     // Phrase-based decoding
     search_algorithm = "cky";
     reordering_limit = 8;
     num_translation_options = 20;
     use_dot_chart = true;

     logger.info("...done");
   }

   // ===============================================================
   // Methods
   // ===============================================================

   /**
    * Applies command-line options to this configuration. The options are written to a temporary
    * file that looks like a config file ("key = value", one per line), which is then loaded with
    * readConfigFile(). It would be more general to define a class that sits on a stream and knows
    * how to chop it up, but this was quicker to implement.
    *
    * Each flag has its first character dropped to form the key. A flag that is the last element,
    * or that is directly followed by an element starting with "-", becomes "key = true". Otherwise
    * all following elements up to the next one starting with "-" are joined with single spaces and
    * used as the value. Note that a value starting with "-" (such as a negative number) is
    * therefore taken for a flag.
    *
    * The temporary file is removed and the writer closed even when processing fails. If an
    * IOException occurs, its stack trace is printed and the JVM exits with status 1.
    *
    * @param options the command-line arguments
    */
   public void processCommandLineOptions(String[] options) {
     File tmpFile = null;
     boolean failed = false;

     try {
       tmpFile = File.createTempFile("options", null, null);

       PrintWriter out = new PrintWriter(new FileWriter(tmpFile));
       try {
         for (int i = 0; i < options.length; i++) {
           String key = options[i].substring(1);

           boolean hasValue = i + 1 < options.length && !options[i + 1].startsWith("-");
           if (!hasValue) {
             // last item, or the next item is another flag: this is a boolean flag
             out.println(key + " = true");
             continue;
           }

           // consume every following element up to the next flag as the value
           out.print(key + " =");
           while (i + 1 < options.length && !options[i + 1].startsWith("-")) {
             out.print(" " + options[i + 1]);
             i++;
           }
           out.println();
         }
       } finally {
         out.close();
       }

       this.readConfigFile(tmpFile.getCanonicalPath());

     } catch (IOException e) {
       e.printStackTrace();
       failed = true;

     } finally {
       // Clean up before exiting: a finally block does not run once System.exit() is called.
       if (tmpFile != null)
         tmpFile.delete();
     }

     if (failed)
       System.exit(1);
   }

   /**
    * Reads a configuration file and applies its settings to this object.
    *
    * Blank lines and comments are skipped. There are two kinds of substantive lines: a line that
    * contains an equals sign is a "key = value" parameter, and every other line is a feature weight
    * that is stored verbatim in the weights list. Parameter names are passed through
    * normalize_key() before matching, so hyphens, underscores, and case do not matter.
    *
    * Deprecated "lm = ..." lines and old-format "tm = TYPE OWNER MAXSPAN PATH" lines are converted
    * to their current equivalents. An unknown parameter, an odd number of inline oov-list tokens,
    * or an invalid search algorithm terminates the JVM with status 1.
    *
    * @param configFile path of the configuration file
    * @throws IOException declared for failures while opening or closing the config file
    *         (presumably raised by LineReader)
    * @throws IllegalArgumentException if the thread count is not positive, or a deprecated lm/tm
    *         line has too few fields
    * @throws NumberFormatException if a numeric value cannot be parsed
    */
   public void readConfigFile(String configFile) throws IOException {

     LineReader configReader = new LineReader(configFile, false);
     try {
       for (String line : configReader) {
         line = line.trim();

         if (Regex.commentOrEmptyLine.matches(line))
           continue;

         if (line.indexOf("=") == -1) {
           /*
            * Lines that don't have an equals sign and are not blank lines, empty lines, or
            * comments, are feature values, which can be present in this file
            */
           weights.add(line);
           continue;
         }

         // parameters; (not feature function)
         String[] fds = Regex.equalsWithSpaces.split(line, 2);
         if (fds.length < 2) {
           Decoder.LOG(1, String.format("* WARNING: skipping config file line '%s'", line));
           continue;
         }

         String parameter = normalize_key(fds[0]);

         if (parameter.equals(normalize_key("lm"))) {
           /* This is deprecated. This supports old LM lines of the form
            *
            *   lm = berkeleylm 5 false false 100 lm.gz
            *
            * LMs are now loaded as general feature functions, so we transform that to
            *
            *   feature-function = LanguageModel -lm_order 5 -lm_type berkeleylm -lm_file lm.gz
            *
            * If the line were state minimizing:
            *
            *   lm = kenlm 5 true false 100 lm.gz
            *
            * it becomes
            *
            *   feature-function = StateMinimizingLanguageModel -lm_order 5 -lm_file lm.gz
            */
           String[] tokens = fds[1].split("\\s+");
           if (tokens.length < 6)
             throw new IllegalArgumentException(String.format(
                 "Invalid deprecated lm line '%s': expected 6 fields but found %d", line,
                 tokens.length));

           if (tokens[2].equals("true"))
             features.add(String.format(
                 "feature_function = StateMinimizingLanguageModel -lm_type kenlm -lm_order %s -lm_file %s",
                 tokens[1], tokens[5]));
           else
             features.add(String.format(
                 "feature_function = LanguageModel -lm_type %s -lm_order %s -lm_file %s",
                 tokens[0], tokens[1], tokens[5]));

         } else if (parameter.equals(normalize_key("tm"))) {
           /* If found, convert old format:
            *   tm = TYPE OWNER MAXSPAN PATH
            * to new format
            *   tm = TYPE -owner OWNER -maxspan MAXSPAN -path PATH
            */
           String tmLine = fds[1];

           String[] tokens = fds[1].split("\\s+");
           if (tokens.length < 2)
             throw new IllegalArgumentException(String.format(
                 "Invalid tm line '%s': expected at least 2 fields", line));

           if (!tokens[1].startsWith("-")) { // old format
             if (tokens.length < 4)
               throw new IllegalArgumentException(String.format(
                   "Invalid deprecated tm line '%s': expected TYPE OWNER MAXSPAN PATH", line));

             tmLine = String.format("%s -owner %s -maxspan %s -path %s", tokens[0], tokens[1],
                 tokens[2], tokens[3]);
             Decoder.LOG(1, String.format(
                 "WARNING: Converting deprecated TM line from '%s' -> '%s'", fds[1], tmLine));
           }
           tms.add(tmLine);

         } else if (parameter.equals("v")) {
           Decoder.VERBOSE = Integer.parseInt(fds[1]);

         } else if (parameter.equals(normalize_key("parse"))) {
           parse = Boolean.parseBoolean(fds[1]);
           logger.finest(String.format("parse: %s", parse));

         } else if (parameter.equals(normalize_key("dump-hypergraph"))) {
           hypergraphFilePattern = fds[1].trim();
           logger.finest(String.format("  hypergraph dump file format: %s",
               hypergraphFilePattern));

         } else if (parameter.equals(normalize_key("oov-list"))) {
           // The value is either the name of an existing file or an inline "label weight ..." list.
           File oovFile = new File(fds[1]);
           if (oovFile.exists()) {
             oovList = readOOVListFile(oovFile);

           } else {
             String[] tokens = fds[1].trim().split("\\s+");
             if (tokens.length % 2 != 0) {
               System.err.println(String.format("* FATAL: invalid format for '%s'", fds[0]));
               System.exit(1);
             }

             oovList = new ArrayList<OOVItem>();
             for (int i = 0; i < tokens.length; i += 2)
               oovList.add(new OOVItem(FormatUtils.markup(tokens[i]),
                   (float) Math.log(Float.parseFloat(tokens[i + 1]))));
           }
           Collections.sort(oovList);

         } else if (parameter.equals(normalize_key("lattice-decoding"))) {
           lattice_decoding = true;

         } else if (parameter.equals(normalize_key("segment-oovs"))) {
           segment_oovs = true;
           lattice_decoding = true;

         } else if (parameter.equals(normalize_key("default-non-terminal"))) {
           default_non_terminal = String.format("[%s]",
               FormatUtils.cleanNonterminal(fds[1].trim()));
           logger.finest(String.format("default_non_terminal: %s", default_non_terminal));

         } else if (parameter.equals(normalize_key("goal-symbol"))) {
           goal_symbol = String.format("[%s]", FormatUtils.cleanNonterminal(fds[1].trim()));
           logger.finest("goalSymbol: " + goal_symbol);

         } else if (parameter.equals(normalize_key("weights-file"))) {
           weights_file = fds[1];

         } else if (parameter.equals(normalize_key("constrain_parse"))) {
           constrain_parse = Boolean.parseBoolean(fds[1]);

         } else if (parameter.equals(normalize_key("true_oovs_only"))) {
           true_oovs_only = Boolean.parseBoolean(fds[1]);

         } else if (parameter.equals(normalize_key("filter-grammar"))) {
           filter_grammar = Boolean.parseBoolean(fds[1]);

         } else if (parameter.equals(normalize_key("amortize"))) {
           amortized_sorting = Boolean.parseBoolean(fds[1]);

         } else if (parameter.equals(normalize_key("use_pos_labels"))) {
           use_pos_labels = Boolean.parseBoolean(fds[1]);

         } else if (parameter.equals(normalize_key("use_unique_nbest"))) {
           use_unique_nbest = Boolean.parseBoolean(fds[1]);
           logger.finest(String.format("use_unique_nbest: %s", use_unique_nbest));

         } else if (parameter.equals(normalize_key("output-format"))) {
           outputFormat = fds[1];
           logger.finest(String.format("output-format: %s", outputFormat));

         } else if (parameter.equals(normalize_key("include_align_index"))) {
           include_align_index = Boolean.parseBoolean(fds[1]);
           logger.finest(String.format("include_align_index: %s", include_align_index));

         } else if (parameter.equals(normalize_key("top_n"))) {
           topN = Integer.parseInt(fds[1]);
           logger.finest(String.format("topN: %s", topN));

         } else if (parameter.equals(normalize_key("num_parallel_decoders"))
             || parameter.equals(normalize_key("threads"))) {
           num_parallel_decoders = Integer.parseInt(fds[1]);
           if (num_parallel_decoders <= 0) {
             throw new IllegalArgumentException(
                 "Must specify a positive number for num_parallel_decoders");
           }
           logger.finest(String.format("num_parallel_decoders: %s", num_parallel_decoders));

         } else if (parameter.equals(normalize_key("mark_oovs"))) {
           mark_oovs = Boolean.parseBoolean(fds[1]);
           logger.finest(String.format("mark_oovs: %s", mark_oovs));

         } else if (parameter.equals(normalize_key("pop-limit"))) {
           pop_limit = Integer.parseInt(fds[1]);
           logger.finest(String.format("pop-limit: %s", pop_limit));

         } else if (parameter.equals(normalize_key("server-port"))) {
           server_port = Integer.parseInt(fds[1]);
           logger.info(String.format("    server-port: %d", server_port));

         } else if (parameter.equals(normalize_key("rescore-forest"))) {
           rescoreForest = true;
           logger.info(String.format("    rescore-forest: %s", rescoreForest));

         } else if (parameter.equals(normalize_key("rescore-forest-weight"))) {
           rescoreForestWeight = Float.parseFloat(fds[1]);
           logger.info(String.format("    rescore-forest-weight: %f", rescoreForestWeight));

         } else if (parameter.equals(normalize_key("maxlen"))) {
           // reset the maximum length
           maxlen = Integer.parseInt(fds[1]);

         } else if (parameter.equals("c") || parameter.equals("config")) {
           // this was used to send in the config file, just ignore it
           ;

         } else if (parameter.equals(normalize_key("feature-function"))) {
           // add the feature to the list of features for later processing
           features.add("feature_function = " + fds[1]);

         } else if (parameter
             .equals(normalize_key(SOFT_SYNTACTIC_CONSTRAINT_DECODING_PROPERTY_NAME))) {
           fuzzy_matching = Boolean.parseBoolean(fds[1]);
           // Label the log line with the property name (the value itself used to be the label).
           logger.finest(String.format("%s: %s",
               SOFT_SYNTACTIC_CONSTRAINT_DECODING_PROPERTY_NAME, fuzzy_matching));

         } else if (parameter.equals(normalize_key("fragment-map"))) {
           fragmentMapFile = fds[1];
           Tree.readMapping(fragmentMapFile);

         // ---- phrase-based parameters ----
         } else if (parameter.equals(normalize_key("search"))) {
           search_algorithm = fds[1];

           if (!search_algorithm.equals("cky") && !search_algorithm.equals("stack")) {
             System.err.println(
                 "* FATAL: -search must be one of 'stack' (for phrase-based decoding)");
             System.err.println("*   or 'cky' (for hierarchical / syntactic decoding)");
             System.exit(1);
           }

         } else if (parameter.equals(normalize_key("reordering-limit"))) {
           reordering_limit = Integer.parseInt(fds[1]);

         } else if (parameter.equals(normalize_key("num-translation-options"))) {
           num_translation_options = Integer.parseInt(fds[1]);

         } else if (parameter.equals(normalize_key("no-dot-chart"))) {
           use_dot_chart = false;

         } else if (parameter.equals(normalize_key("moses"))) {
           moses = true; // triggers some Moses-specific compatibility options

         } else if (parameter.equals(normalize_key("show-weights"))) {
           show_weights_and_quit = true;

         } else if (parameter.equals(normalize_key("input-type"))) {
           ; // for Moses compatibility; ignore this

         } else if (parameter.equals(normalize_key("n-best-list"))) {
           // for Moses compatibility: "FILE [N]"
           String[] tokens = fds[1].split("\\s+");
           n_best_file = tokens[0];
           if (tokens.length > 1)
             topN = Integer.parseInt(tokens[1]);

         } else if (parameter.equals(normalize_key("input-file"))) {
           // for Moses compatibility
           input_file = fds[1];

         } else if (parameter.equals(normalize_key("weight-file"))) {
           // for Moses, ignore

         } else if (parameter.equals(normalize_key("weight-overwrite"))) {
           weight_overwrite = fds[1];

         } else if (parameter.equals(normalize_key("source-annotations"))) {
           // Check source sentence
           source_annotations = true;

         } else if (parameter.equals(normalize_key("use-sent-specific-tm"))
             || parameter.equals(normalize_key("add-combined-cost"))
             || parameter.equals(normalize_key("use-tree-nbest"))
             || parameter.equals(normalize_key("use-kenlm"))
             || parameter.equals(normalize_key("useCubePrune"))
             || parameter.equals(normalize_key("useBeamAndThresholdPrune"))
             || parameter.equals(normalize_key("regexp-grammar"))) {
           logger.warning(String.format("WARNING: ignoring deprecated parameter '%s'", fds[0]));

         } else {
           logger.warning("FATAL: unknown configuration parameter '" + fds[0] + "'");
           System.exit(1);
         }

         Decoder.LOG(1, String.format("    %s = '%s'", parameter, fds[1]));
       }
     } finally {
       configReader.close();
     }
   }

   /**
    * Reads OOV items from a file with one "label weight" pair per line. Each weight is stored as
    * its natural logarithm. Blank lines are skipped. Reading is best-effort: if an IOException
    * occurs, a warning is logged and the items read so far are returned. The reader is always
    * closed.
    */
   private ArrayList<OOVItem> readOOVListFile(File file) {
     ArrayList<OOVItem> items = new ArrayList<OOVItem>();
     BufferedReader reader = null;
     try {
       reader = new BufferedReader(new FileReader(file));
       for (String entry = reader.readLine(); entry != null; entry = reader.readLine()) {
         entry = entry.trim();
         if (entry.length() == 0)
           continue;

         String[] tokens = entry.split("\\s+");
         items.add(new OOVItem(FormatUtils.markup(tokens[0]),
             (float) Math.log(Float.parseFloat(tokens[1]))));
       }
     } catch (IOException e) {
       logger.warning(String.format("Could not read OOV list file '%s': %s", file, e));
     } finally {
       if (reader != null) {
         try {
           reader.close();
         } catch (IOException ignored) {
           // nothing useful can be done if closing fails
         }
       }
     }
     return items;
   }

   /**
    * Checks for invalid variable configurations.
    *
    * NOTE: currently a no-op. The body is empty, so no setting is actually validated.
    */
   public void sanityCheck() {
   }

   /**
    * Normalizes parameter names by removing underscores and hyphens and lowercasing. This defines
    * equivalence classes on external use of parameter names, permitting arbitrary_under_scores and
    * camelCasing in parameter names without forcing the user to memorize them all. Here are some
    * examples of equivalent ways to refer to parameter names:
    *
    * {pop-limit, poplimit, PopLimit, popLimit, pop_lim_it} {lmfile, lm-file, LM-FILE, lm_file}
    *
    * Lowercasing uses the root locale so that the result does not depend on the JVM's default
    * locale (under a Turkish locale, "I" would otherwise lowercase to a dotless "i" and keys such as
    * "LM-FILE" would no longer match).
    *
    * @param text the parameter name as written by the user; must not be null
    * @return the name without '-' and '_' characters, in lower case
    */
   public static String normalize_key(String text) {
     return text.replace("-", "").replace("_", "").toLowerCase(java.util.Locale.ROOT);
   }
 }
	package joshua.decoder;

	import java.io.File;
	import java.io.FileWriter;
	import java.io.IOException;
	import java.io.PrintWriter;
	import java.io.BufferedReader;
	import java.io.FileReader;
	import java.util.ArrayList;
	import java.util.Collections;
	import java.util.logging.Logger;

	import joshua.decoder.ff.StatefulFF;
	import joshua.decoder.ff.fragmentlm.Tree;
	import joshua.util.FormatUtils;
	import joshua.util.Regex;
	import joshua.util.io.LineReader;

	/**
	* Configuration file for Joshua decoder.
	*
	* When adding new features to Joshua, any new configurable parameters should be added to this
	* class.
	*
	* @author Zhifei Li, <zhifei.work@gmail.com>
	* @author Matt Post <post@cs.jhu.edu>
	*/
	public class JoshuaConfiguration {

	// List of grammar files to read
	public ArrayList<String> tms = new ArrayList<String>();

	/*
	* The file to read the weights from (part of the sparse features implementation). Weights can
	* also just be listed in the main config file.
	*/
	public String weights_file = "";

	// Default symbols. The symbol here should be enclosed in square brackets.
	public String default_non_terminal = FormatUtils.markup("X");
	public String goal_symbol = FormatUtils.markup("GOAL");

	/*
	* A list of OOV symbols in the form
	*
	* [X1] weight [X2] weight [X3] weight ...
	*
	* where the [X] symbols are nonterminals and the weights are weights. For each OOV word w in the
	* input sentence, Joshua will create rules of the form
	*
	* X1 -> w (weight)
	*
	* If this is empty, an unweighted default_non_terminal is used.
	*/

	/**
	 * A nonterminal label together with its weight. The natural ordering places items with larger
	 * weights first, so sorting a list of items yields descending weight order.
	 */
	public class OOVItem implements Comparable<OOVItem> {
	  public String label;
	  public float weight;

	  OOVItem(String label, float weight) {
	    this.label = label;
	    this.weight = weight;
	  }

	  /**
	   * Compares by weight, heaviest first.
	   *
	   * @param other the item to compare against
	   * @return -1 if this weight is greater, 1 if it is smaller, and 0 otherwise (which includes
	   *         the case where either weight is NaN)
	   */
	  @Override
	  public int compareTo(OOVItem other) {
	    // Explicit comparisons (rather than Float.compare) so that NaN compares as "equal".
	    return (weight > other.weight) ? -1 : ((weight < other.weight) ? 1 : 0);
	  }
	}
	public ArrayList<OOVItem> oovList = null;

	/*
	* Whether to segment OOVs into a lattice
	*/
	public boolean segment_oovs = false;

	/*
	* Enable lattice decoding.
	*/
	public boolean lattice_decoding = false;

	/*
	* If false, sorting of the complete grammar is done at load time. If true, grammar tries are not
	* sorted till they are first accessed. Amortized sorting means you get your first translation
	* much, much quicker (good for debugging), but that per-sentence decoding is a bit slower.
	*/
	public boolean amortized_sorting = true;

	// syntax-constrained decoding
	public boolean constrain_parse = false;
	public boolean use_pos_labels = false;

	// oov-specific
	public boolean true_oovs_only = false;

	/* Dynamic sentence-level filtering. */
	public boolean filter_grammar = false;

	/* The cube pruning pop limit. Set to 0 for exhaustive pruning. */
	public int pop_limit = 100;

	/* Maximum sentence length. Sentences longer than this are truncated. */
	public int maxlen = 200;

	/*
	* N-best configuration.
	*/
	// Make sure output strings in the n-best list are unique.
	public boolean use_unique_nbest = true;

	/* Include the phrasal alignments in the output (not word-level alignments at the moment). */
	public boolean include_align_index = false;

	/* The number of hypotheses to output by default. */
	public int topN = 1;

	/**
	* This string describes the format of each line of output from the decoder (i.e., the
	* translations). The string can include arbitrary text and also variables. The following
	* variables are available:
	*
	* <pre>
	* - %i the 0-indexed sentence number
	* - %e the source string %s the translated sentence
	* - %S the translated sentence with some basic capitalization and denormalization
	* - %t the synchronous derivation
	* - %f the list of feature values (as name=value pairs)
	* - %c the model cost
	* - %w the weight vector
	* - %a the alignments between source and target words (currently unimplemented)
	* - %d a verbose, many-line version of the derivation
	* </pre>
	*/
	public String outputFormat = "%i \|\|\| %s \|\|\| %f \|\|\| %c";

	/* The number of decoding threads to use (-threads). */
	public int num_parallel_decoders = 1;

	// disk hg
	public String hypergraphFilePattern = "";

	/*
	* When true, _OOV is appended to all words that are passed through (useful for something like
	* transliteration on the target side).
	*/
	public boolean mark_oovs = false;

	/* Enables synchronous parsing. */
	public boolean parse = false; // perform synchronous parsing

	private final Logger logger = Logger.getLogger(JoshuaConfiguration.class.getName());

	/* A list of the feature functions. */
	public ArrayList<String> features = new ArrayList<String>();

	/* A list of weights found in the main config file (instead of in a separate weights file) */
	public ArrayList<String> weights = new ArrayList<String>();

	/* If set, Joshua will start a (multi-threaded, per "threads") TCP/IP server on this port. */
	public int server_port = 0;

	/*
	* Whether to do forest rescoring. If set to true, the references are expected on STDIN along with
	* the input sentences in the following format:
	*
	* input sentence ||| ||| reference1 ||| reference2 ...
	*
	* (The second field is reserved for the output sentence for alignment and forced decoding).
	*/

	public boolean rescoreForest = false;
	public float rescoreForestWeight = 10.0f;

	/*
	* Location of fragment mapping file, which maps flattened SCFG rules to their internal
	* representation.
	*/
	public String fragmentMapFile = null;

	/*
	* Whether to use soft syntactic constraint decoding /fuzzy matching, which allows that any
	* nonterminal may be substituted for any other nonterminal (except for OOV and GOAL)
	*/
	public boolean fuzzy_matching = false;

	public static final String SOFT_SYNTACTIC_CONSTRAINT_DECODING_PROPERTY_NAME = "fuzzy_matching";

	/***
	* Phrase-based decoding parameters.
	*/

	/* The search algorithm: currently either "cky" or "stack" */
	public String search_algorithm = "cky";

	/* The distortion limit */
	public int reordering_limit = 8;

	/* The number of target sides considered for each source side (after sorting by model weight) */
	public int num_translation_options = 20;

	/* If true, decode using a dot chart (standard CKY+); if false, use the much more efficient
	* version of Sennrich (SSST 2014)
	*/
	public boolean use_dot_chart = true;

	/* Moses compatibility */
	public boolean moses = false;

	/* If true, just print out the weights found in the config file, and exit. */
	public boolean show_weights_and_quit = false;

	/* Read input from a file (Moses compatible flag) */
	public String input_file = null;

	/* Write n-best output to this file */
	public String n_best_file = null;

	/* Whether to look at source side for special annotations */
	public boolean source_annotations = false;

	/* Weights overridden from the command line */
	public String weight_overwrite = "";

	/**
	* This method resets the state of JoshuaConfiguration back to the state after initialization.
	* This is useful when for example making different calls to the decoder within the same java
	* program, which otherwise leads to potential errors due to inconsistent state as a result of
	* loading the configuration multiple times without resetting etc.
	*
	* This leads to the insight that in fact it may be an even better idea to refactor the code and
	* make JoshuaConfiguration an object that is is created and passed as an argument, rather than a
	* shared static object. This is just a suggestion for the next step.
	*
	*/
	public void reset() {
	logger.info("Resetting the JoshuaConfiguration to its defaults ...");
	logger.info("\n\tResetting the StatefullFF global state index ...");
	logger.info("\n\t...done");
	StatefulFF.resetGlobalStateIndex();
	tms = new ArrayList<String>();
	weights_file = "";
	default_non_terminal = "[X]";
	oovList = new ArrayList<OOVItem>();
	oovList.add(new OOVItem(default_non_terminal, 1.0f));
	goal_symbol = "[GOAL]";
	amortized_sorting = true;
	constrain_parse = false;
	use_pos_labels = false;
	true_oovs_only = false;
	filter_grammar = false;
	pop_limit = 100;
	maxlen = 200;
	use_unique_nbest = false;
	include_align_index = false;
	topN = 1;
	outputFormat = "%i \|\|\| %s \|\|\| %f \|\|\| %c";
	num_parallel_decoders = 1;
	hypergraphFilePattern = "";
	mark_oovs = false;
	// oracleFile = null;
	parse = false; // perform synchronous parsing
	features = new ArrayList<String>();
	weights = new ArrayList<String>();
	server_port = 0;

	reordering_limit = 8;
	num_translation_options = 20;
	logger.info("...done");
	}

	// ===============================================================
	// Methods
	// ===============================================================

	/**
	* To process command-line options, we write them to a file that looks like the config file, and
	* then call readConfigFile() on it. It would be more general to define a class that sits on a
	* stream and knows how to chop it up, but this was quicker to implement.
	*/
	public void processCommandLineOptions(String[] options) {
	try {
	File tmpFile = File.createTempFile("options", null, null);
	PrintWriter out = new PrintWriter(new FileWriter(tmpFile));

	for (int i = 0; i < options.length; i++) {
	String key = options[i].substring(1);
	if (i + 1 == options.length \|\| options[i + 1].startsWith("-")) {
	// if this is the last item, or if the next item
	// is another flag, then this is a boolean flag
	out.println(key + " = true");

	} else {
	out.print(key + " =");
	while (i + 1 < options.length && ! options[i + 1].startsWith("-")) {
	out.print(String.format(" %s", options[i + 1]));
	i++;
	}
	out.println();
	}
	}
	out.close();
	this.readConfigFile(tmpFile.getCanonicalPath());

	tmpFile.delete();

	} catch (IOException e) {
	e.printStackTrace();
	System.exit(1);
	}
	}

	public void readConfigFile(String configFile) throws IOException {

	LineReader configReader = new LineReader(configFile, false);
	try {
	for (String line : configReader) {
	line = line.trim(); // .toLowerCase();

	if (Regex.commentOrEmptyLine.matches(line))
	continue;

	/*
	* There are two kinds of substantive (non-comment, non-blank) lines: parameters and feature
	* values. Parameters match the pattern "key = value"; all other substantive lines are
	* interpreted as features.
	*/

	if (line.indexOf("=") != -1) { // parameters; (not feature function)
	String[] fds = Regex.equalsWithSpaces.split(line, 2);
	if (fds.length < 2) {
	Decoder.LOG(1, String.format("* WARNING: skipping config file line '%s'", line));
	continue;
	}

	String parameter = normalize_key(fds[0]);

	if (parameter.equals(normalize_key("lm"))) {
	/* This is deprecated. This support old LM lines of the form
	*
	* lm = berkeleylm 5 false false 100 lm.gz
	*
	* LMs are now loaded as general feature functions, so we transform that to either
	*
	* feature-function = LanguageModel -lm_order 5 -lm_type berkeleylm -lm_file lm.gz
	*
	* If the line were state minimizing:
	*
	* lm = kenlm 5 true false 100 lm.gz
	*
	* feature-function = StateMinimizingLanguageModel -lm_order 5 -lm_file lm.gz
	*/

	String[] tokens = fds[1].split("\\s+");
	if (tokens[2].equals("true"))
	features.add(String.format("feature_function = StateMinimizingLanguageModel -lm_type kenlm -lm_order %s -lm_file %s",
	tokens[1], tokens[5]));
	else
	features.add(String.format("feature_function = LanguageModel -lm_type %s -lm_order %s -lm_file %s",
	tokens[0], tokens[1], tokens[5]));

	} else if (parameter.equals(normalize_key("tm"))) {
	/* If found, convert old format:
	* tm = TYPE OWNER MAXSPAN PATH
	* to new format
	* tm = TYPE -owner OWNER -maxspan MAXSPAN -path PATH
	*/
	String tmLine = fds[1];

	String[] tokens = fds[1].split("\\s+");
	if (! tokens[1].startsWith("-")) { // old format
	tmLine = String.format("%s -owner %s -maxspan %s -path %s", tokens[0], tokens[1], tokens[2], tokens[3]);
	Decoder.LOG(1, String.format("WARNING: Converting deprecated TM line from '%s' -> '%s'", fds[1], tmLine));
	}
	tms.add(tmLine);

	} else if (parameter.equals("v")) {
	Decoder.VERBOSE = Integer.parseInt(fds[1]);

	} else if (parameter.equals(normalize_key("parse"))) {
	parse = Boolean.parseBoolean(fds[1]);
	logger.finest(String.format("parse: %s", parse));

	} else if (parameter.equals(normalize_key("dump-hypergraph"))) {
	hypergraphFilePattern = fds[1].trim();
	logger
	.finest(String.format(" hypergraph dump file format: %s", hypergraphFilePattern));

	} else if (parameter.equals(normalize_key("oov-list"))) {
	if (new File(fds[1]).exists()) {
	oovList = new ArrayList<OOVItem>();
	try {
	File file = new File(fds[1]);
	BufferedReader br = new BufferedReader(new FileReader(file));
	try {
	String str = br.readLine();
	while (str != null) {
	String[] tokens = str.trim().split("\\s+");

	oovList.add(new OOVItem(FormatUtils.markup(tokens[0]),
	(float) Math.log(Float.parseFloat(tokens[1]))));

	str = br.readLine();
	}
	br.close();
	} catch(IOException e){
	System.out.println(e);
	}
	} catch(IOException e){
	System.out.println(e);
	}
	Collections.sort(oovList);

	} else {
	String[] tokens = fds[1].trim().split("\\s+");
	if (tokens.length % 2 != 0) {
	System.err.println(String.format("* FATAL: invalid format for '%s'", fds[0]));
	System.exit(1);
	}

	oovList = new ArrayList<OOVItem>();

	for (int i = 0; i < tokens.length; i += 2)
	oovList.add(new OOVItem(FormatUtils.markup(tokens[i]),
	(float) Math.log(Float.parseFloat(tokens[i + 1]))));

	Collections.sort(oovList);
	}

	} else if (parameter.equals(normalize_key("lattice-decoding"))) {
	lattice_decoding = true;

	} else if (parameter.equals(normalize_key("segment-oovs"))) {
	segment_oovs = true;
	lattice_decoding = true;

	} else if (parameter.equals(normalize_key("default-non-terminal"))) {
	default_non_terminal = String.format("[%s]", FormatUtils.cleanNonterminal(fds[1].trim()));
	logger.finest(String.format("default_non_terminal: %s", default_non_terminal));

	} else if (parameter.equals(normalize_key("goal-symbol"))) {
	goal_symbol = String.format("[%s]", FormatUtils.cleanNonterminal(fds[1].trim()));
	logger.finest("goalSymbol: " + goal_symbol);

	} else if (parameter.equals(normalize_key("weights-file"))) {
	weights_file = fds[1];

	} else if (parameter.equals(normalize_key("constrain_parse"))) {
	constrain_parse = Boolean.parseBoolean(fds[1]);

	} else if (parameter.equals(normalize_key("true_oovs_only"))) {
	true_oovs_only = Boolean.parseBoolean(fds[1]);

	} else if (parameter.equals(normalize_key("filter-grammar"))) {
	filter_grammar = Boolean.parseBoolean(fds[1]);

	} else if (parameter.equals(normalize_key("amortize"))) {
	amortized_sorting = Boolean.parseBoolean(fds[1]);

	} else if (parameter.equals(normalize_key("use_pos_labels"))) {
	use_pos_labels = Boolean.parseBoolean(fds[1]);

	} else if (parameter.equals(normalize_key("use_unique_nbest"))) {
	use_unique_nbest = Boolean.valueOf(fds[1]);
	logger.finest(String.format("use_unique_nbest: %s", use_unique_nbest));

	} else if (parameter.equals(normalize_key("output-format"))) {
	outputFormat = fds[1];
	logger.finest(String.format("output-format: %s", outputFormat));

	} else if (parameter.equals(normalize_key("include_align_index"))) {
	include_align_index = Boolean.valueOf(fds[1]);
	logger.finest(String.format("include_align_index: %s", include_align_index));

	} else if (parameter.equals(normalize_key("top_n"))) {
	topN = Integer.parseInt(fds[1]);
	logger.finest(String.format("topN: %s", topN));

	} else if (parameter.equals(normalize_key("num_parallel_decoders"))
	\|\| parameter.equals(normalize_key("threads"))) {
	num_parallel_decoders = Integer.parseInt(fds[1]);
	if (num_parallel_decoders <= 0) {
	throw new IllegalArgumentException(
	"Must specify a positive number for num_parallel_decoders");
	}
	logger.finest(String.format("num_parallel_decoders: %s", num_parallel_decoders));

	} else if (parameter.equals(normalize_key("mark_oovs"))) {
	mark_oovs = Boolean.valueOf(fds[1]);
	logger.finest(String.format("mark_oovs: %s", mark_oovs));

	} else if (parameter.equals(normalize_key("pop-limit"))) {
	pop_limit = Integer.valueOf(fds[1]);
	logger.finest(String.format("pop-limit: %s", pop_limit));

	} else if (parameter.equals(normalize_key("server-port"))) {
	server_port = Integer.parseInt(fds[1]);
	logger.info(String.format(" server-port: %d", server_port));

	} else if (parameter.equals(normalize_key("rescore-forest"))) {
	rescoreForest = true;
	logger.info(String.format(" rescore-forest: %s", rescoreForest));

	} else if (parameter.equals(normalize_key("rescore-forest-weight"))) {
	rescoreForestWeight = Float.parseFloat(fds[1]);
	logger.info(String.format(" rescore-forest-weight: %f", rescoreForestWeight));

	} else if (parameter.equals(normalize_key("maxlen"))) {
	// reset the maximum length
	maxlen = Integer.parseInt(fds[1]);

	} else if (parameter.equals("c") \|\| parameter.equals("config")) {
	// this was used to send in the config file, just ignore it
	;

	} else if (parameter.equals(normalize_key("feature-function"))) {
	// add the feature to the list of features for later processing
	features.add("feature_function = " + fds[1]);

	} else if (parameter.equals(normalize_key("maxlen"))) {
	// add the feature to the list of features for later processing
	maxlen = Integer.parseInt(fds[1]);

	} else if (parameter
	.equals(normalize_key(SOFT_SYNTACTIC_CONSTRAINT_DECODING_PROPERTY_NAME))) {
	fuzzy_matching = Boolean.parseBoolean(fds[1]);
	logger.finest(String.format(fuzzy_matching + ": %s", fuzzy_matching));

	} else if (parameter.equals(normalize_key("fragment-map"))) {
	fragmentMapFile = fds[1];
	Tree.readMapping(fragmentMapFile);

	/ PHRASE-BASED PARAMETERS /
	} else if (parameter.equals(normalize_key("search"))) {
	search_algorithm = fds[1];

	if (!search_algorithm.equals("cky") && !search_algorithm.equals("stack")) {
	System.err.println("* FATAL: -search must be one of 'stack' (for phrase-based decoding)");
	System.err.println("* or 'cky' (for hierarchical / syntactic decoding)");
	System.exit(1);
	}

	} else if (parameter.equals(normalize_key("reordering-limit"))) {
	reordering_limit = Integer.parseInt(fds[1]);

	} else if (parameter.equals(normalize_key("num-translation-options"))) {
	num_translation_options = Integer.parseInt(fds[1]);

	} else if (parameter.equals(normalize_key("no-dot-chart"))) {
	use_dot_chart = false;

	} else if (parameter.equals(normalize_key("moses"))) {
	moses = true; // triggers some Moses-specific compatibility options

	} else if (parameter.equals(normalize_key("show-weights"))) {
	show_weights_and_quit = true;

	} else if (parameter.equals(normalize_key("input-type"))) {
	; // for Moses compatibility; ignore this

	} else if (parameter.equals(normalize_key("n-best-list"))) {
	// for Moses compatibility
	String[] tokens = fds[1].split("\\s+");
	n_best_file = tokens[0];
	if (tokens.length > 1)
	topN = Integer.parseInt(tokens[1]);

	} else if (parameter.equals(normalize_key("input-file"))) {
	// for Moses compatibility
	input_file = fds[1];

	} else if (parameter.equals(normalize_key("weight-file"))) {
	// for Moses, ignore

	} else if (parameter.equals(normalize_key("weight-overwrite"))) {
	weight_overwrite = fds[1];

	} else if (parameter.equals(normalize_key("source-annotations"))) {
	// Check source sentence
	source_annotations = true;

	} else {

	if (parameter.equals(normalize_key("use-sent-specific-tm"))
	\|\| parameter.equals(normalize_key("add-combined-cost"))
	\|\| parameter.equals(normalize_key("use-tree-nbest"))
	\|\| parameter.equals(normalize_key("use-kenlm"))
	\|\| parameter.equals(normalize_key("useCubePrune"))
	\|\| parameter.equals(normalize_key("useBeamAndThresholdPrune"))
	\|\| parameter.equals(normalize_key("regexp-grammar"))) {
	logger.warning(String.format("WARNING: ignoring deprecated parameter '%s'", fds[0]));

	} else {
	logger.warning("FATAL: unknown configuration parameter '" + fds[0] + "'");
	System.exit(1);
	}
	}

	Decoder.LOG(1, String.format(" %s = '%s'", normalize_key(fds[0]), fds[1]));

	} else {
	/*
	* Lines that don't have an equals sign and are not blank lines, empty lines, or comments,
	* are feature values, which can be present in this file
	*/

	weights.add(line);
	}
	}
	} finally {
	configReader.close();
	}
	}

	/**
	* Checks for invalid variable configurations.
	*
	* The body is currently empty, so calling this method performs no checks and has no effect.
	*/
	public void sanityCheck() {
	}

	/**
	 * Normalizes parameter names by removing underscores and hyphens and lowercasing. This defines
	 * equivalence classes on external use of parameter names, permitting arbitrary_under_scores and
	 * camelCasing in parameter names without forcing the user to memorize them all. Here are some
	 * examples of equivalent ways to refer to parameter names:
	 *
	 * {pop-limit, poplimit, PopLimit, popLimit, pop_lim_it} {lmfile, lm-file, LM-FILE, lm_file}
	 *
	 * Lowercasing uses the root locale so that the result does not depend on the JVM's default
	 * locale (under a Turkish default locale, for instance, "I" would otherwise become a dotless
	 * "ı" and "LM-FILE" would no longer match "lm-file").
	 *
	 * @param text the parameter name as written by the user
	 * @return the name without hyphens and underscores, in lower case
	 */
	public static String normalize_key(String text) {
		return text.replaceAll("[-_]", "").toLowerCase(java.util.Locale.ROOT);
	}
	}