blob: c4de11158596feaf0717a8b170fb3874ec09de31 [file] [log] [blame]
package joshua.decoder;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.io.FileNotFoundException;
import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.logging.Logger;
import joshua.corpus.Vocabulary;
import joshua.decoder.ff.FeatureVector;
import joshua.decoder.ff.FeatureFunction;
import joshua.decoder.ff.ArityPhrasePenaltyFF;
import joshua.decoder.ff.LabelCombinationFF;
import joshua.decoder.ff.LabelSubstitutionFF;
import joshua.decoder.ff.OOVFF;
import joshua.decoder.ff.PhraseModelFF;
import joshua.decoder.ff.PhrasePenaltyFF;
import joshua.decoder.ff.RuleFF;
import joshua.decoder.ff.RuleLengthFF;
import joshua.decoder.ff.SourcePathFF;
import joshua.decoder.ff.WordPenaltyFF;
import joshua.decoder.ff.fragmentlm.FragmentLMFF;
import joshua.decoder.ff.lm.KenLMFF;
import joshua.decoder.ff.lm.LanguageModelFF;
import joshua.decoder.ff.lm.NGramLanguageModel;
import joshua.decoder.ff.lm.berkeley_lm.LMGrammarBerkeley;
import joshua.decoder.ff.lm.kenlm.jni.KenLM;
import joshua.decoder.ff.phrase.DistortionFF;
import joshua.decoder.ff.similarity.EdgePhraseSimilarityFF;
import joshua.decoder.ff.tm.Grammar;
import joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar;
import joshua.decoder.ff.tm.packed.PackedGrammar;
import joshua.decoder.io.TranslationRequest;
import joshua.decoder.phrase.PhraseTable;
import joshua.decoder.segment_file.Sentence;
import joshua.util.FileUtility;
import joshua.util.Regex;
import joshua.util.io.LineReader;
/**
* This class handles decoder initialization and the complication introduced by multithreading.
*
* After initialization, the main entry point to the Decoder object is
* decodeAll(TranslationRequest), which returns a set of Translation objects wrapped in an iterable
* Translations object. It is important that we support multithreading both (a) across the sentences
* within a request and (b) across requests, in a round-robin fashion. This is done by maintaining a
* fixed sized concurrent thread pool. When a new request comes in, a RequestHandler thread is
* launched. This object iterates over the request's sentences, obtaining a thread from the
* thread pool, and using that thread to decode the sentence. If a decoding thread is not available,
* it will block until one is in a fair (FIFO) manner. This maintains fairness across requests so
* long as each request only requests a thread when it has a sentence ready.
*
* A decoding thread is handled by DecoderThread and launched from DecoderThreadRunner. The purpose
* of the runner is to record where to place the translated sentence when it is done (i.e., which
* Translations object). Translations itself is an iterator whose next() call blocks until the next
* translation is available.
*
* @author Matt Post <post@cs.jhu.edu>
* @author Zhifei Li, <zhifei.work@gmail.com>
* @author wren ng thornton <wren@users.sourceforge.net>
* @author Lane Schwartz <dowobeha@users.sourceforge.net>
*/
public class Decoder {

  private final JoshuaConfiguration joshuaConfiguration;

  /*
   * Many of these objects themselves are global objects. We pass them in when constructing other
   * objects, so that they all share pointers to the same object. This is good because it reduces
   * overhead, but it can be problematic because of unseen dependencies (for example, in the
   * Vocabulary shared by language model, translation grammar, etc).
   */

  /* The translation grammars, in the order they were listed in the config file. */
  private List<Grammar> grammars;

  /* Every active feature function (built-in and reflectively loaded). */
  private ArrayList<FeatureFunction> featureFunctions;

  /* Language models instantiated from the "lm" lines of the config file. */
  private ArrayList<NGramLanguageModel> languageModels;

  /* A sorted list of the feature names (so they can be output in the order they were read in) */
  public static ArrayList<String> feature_names = new ArrayList<String>();

  /* The feature weights. */
  public static FeatureVector weights;

  /** Logger for this class. */
  private static final Logger logger = Logger.getLogger(Decoder.class.getName());

  /*
   * Fair (FIFO) pool of decoder threads, sized by num_parallel_decoders. A thread is checked out
   * for the duration of one sentence and must always be returned to the pool.
   */
  private BlockingQueue<DecoderThread> threadPool = null;

  public static boolean usingNonlocalFeatures = false;

  // ===============================================================
  // Constructors
  // ===============================================================

  /**
   * Constructor method that creates a new decoder using the specified configuration file.
   *
   * @param configFile Name of configuration file.
   */
  public Decoder(JoshuaConfiguration joshuaConfiguration, String configFile) {
    this(joshuaConfiguration);
    this.initialize(configFile);
  }

  /**
   * Factory method that creates a new decoder using the specified configuration file.
   *
   * @param configFile Name of configuration file.
   */
  public static Decoder createDecoder(String configFile) {
    JoshuaConfiguration joshuaConfiguration = new JoshuaConfiguration();
    return new Decoder(joshuaConfiguration, configFile);
  }

  /**
   * Constructs an uninitialized decoder for use in testing.
   * <p>
   * This method is private because it should only ever be called by the
   * {@link #getUninitalizedDecoder()} method to provide an uninitialized decoder for use in
   * testing.
   */
  private Decoder(JoshuaConfiguration joshuaConfiguration) {
    this.joshuaConfiguration = joshuaConfiguration;
    this.grammars = new ArrayList<Grammar>();
    // Fair queue so sentences are granted threads in FIFO order across requests.
    this.threadPool = new ArrayBlockingQueue<DecoderThread>(
        this.joshuaConfiguration.num_parallel_decoders, true);
  }

  /**
   * Gets an uninitialized decoder for use in testing.
   * <p>
   * This method is called by unit tests or any outside packages (e.g., MERT) relying on the
   * decoder. (The misspelling "Uninitalized" is preserved for source compatibility with existing
   * callers.)
   */
  static public Decoder getUninitalizedDecoder(JoshuaConfiguration joshuaConfiguration) {
    return new Decoder(joshuaConfiguration);
  }

  // ===============================================================
  // Public Methods
  // ===============================================================

  /**
   * This class is responsible for getting sentences from the TranslationRequest and procuring a
   * DecoderThreadRunner to translate it. Each call to decodeAll(TranslationRequest) launches a
   * thread that will read the request's sentences, obtain a DecoderThread to translate them, and
   * then place the Translation in the appropriate place.
   *
   * @author Matt Post <post@cs.jhu.edu>
   *
   */
  private class RequestHandler extends Thread {
    /* Source of sentences to translate. */
    private final TranslationRequest request;

    /* Where to put translated sentences. */
    private final Translations response;

    RequestHandler(TranslationRequest request, Translations response) {
      this.request = request;
      this.response = response;
    }

    @Override
    public void run() {
      /*
       * Repeatedly get an input sentence, wait for a DecoderThread, and then start a new thread to
       * translate the sentence. We start a new thread (via DecoderRunnerThread) as opposed to
       * blocking, so that the RequestHandler can go on to the next sentence in this request, which
       * allows parallelization across the sentences of the request.
       */
      for (;;) {
        Sentence sentence = request.next();
        if (sentence == null) {
          // End of the request: tell the Translations object no more results are coming.
          response.finish();
          break;
        }

        // This will block until a DecoderThread becomes available.
        DecoderThread thread = Decoder.this.getThread();
        new DecoderThreadRunner(thread, sentence, response).start();
      }
    }
  }

  /**
   * Retrieve a thread from the thread pool, blocking until one is available. The blocking occurs in
   * a fair fashion (i.e,. FIFO across requests).
   *
   * @return a thread that can be used for decoding, or null if this thread was interrupted while
   *         waiting (in which case the interrupt status is restored).
   */
  public DecoderThread getThread() {
    try {
      return threadPool.take();
    } catch (InterruptedException e) {
      // Restore the interrupt flag so callers up the stack can observe the interruption.
      Thread.currentThread().interrupt();
      e.printStackTrace();
    }
    return null;
  }

  /**
   * This class handles running a DecoderThread (which takes care of the actual translation of an
   * input Sentence, returning a Translation object when its done). This is done in a thread so as
   * not to tie up the RequestHandler that launched it, freeing it to go on to the next sentence in
   * the TranslationRequest, in turn permitting parallelization across the sentences of a request.
   *
   * When the decoder thread is finished, the Translation object is placed in the correct place in
   * the corresponding Translations object that was returned to the caller of
   * Decoder.decodeAll(TranslationRequest).
   *
   * @author Matt Post <post@cs.jhu.edu>
   */
  private class DecoderThreadRunner extends Thread {

    private final DecoderThread decoderThread;
    private final Sentence sentence;
    private final Translations translations;

    DecoderThreadRunner(DecoderThread thread, Sentence sentence, Translations translations) {
      this.decoderThread = thread;
      this.sentence = sentence;
      this.translations = translations;
    }

    @Override
    public void run() {
      /*
       * Use the thread to translate the sentence. Then record the translation with the
       * corresponding Translations object, and return the thread to the pool.
       */
      try {
        Translation translation = decoderThread.translate(this.sentence);
        translations.record(translation);

        /*
         * This is crucial! It's what makes the thread available for the next sentence to be
         * translated.
         */
        threadPool.put(decoderThread);
      } catch (InterruptedException e) {
        // Restore the interrupt flag; the pool thread is lost, so warn loudly.
        Thread.currentThread().interrupt();
        System.err
            .println("* WARNING: I encountered an error trying to return the decoder thread.");
        e.printStackTrace();
      } catch (RuntimeException e) {
        System.err.println(String.format(
            "* Decoder: fatal uncaught runtime exception on sentence %d: %s", sentence.id(),
            e.getMessage()));
        e.printStackTrace();
        System.exit(1);
      }
    }
  }

  /**
   * This function is the main entry point into the decoder. It translates all the sentences in a
   * (possibly boundless) set of input sentences. Each request launches its own thread to read the
   * sentences of the request.
   *
   * @param request
   * @return an iterable set of Translation objects
   */
  public Translations decodeAll(TranslationRequest request) {
    Translations translations = new Translations(request);
    new RequestHandler(request, translations).start();
    return translations;
  }

  /**
   * We can also just decode a single sentence.
   *
   * @param sentence
   * @return The translated sentence, or null if this thread was interrupted while waiting for a
   *         decoder thread (interrupt status is restored).
   */
  public Translation decode(Sentence sentence) {
    // Get a thread.
    try {
      DecoderThread thread = threadPool.take();
      try {
        return thread.translate(sentence);
      } finally {
        // Always return the thread to the pool, even if translation throws; otherwise the
        // pool permanently shrinks by one thread per failure.
        threadPool.put(thread);
      }
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
      e.printStackTrace();
    }
    return null;
  }

  /**
   * Waits for all pooled decoder threads to terminate.
   */
  public void cleanUp() {
    for (DecoderThread thread : threadPool) {
      try {
        thread.join();
      } catch (InterruptedException e) {
        // Restore the interrupt flag and keep joining the remaining threads.
        Thread.currentThread().interrupt();
        e.printStackTrace();
      }
    }
  }

  /**
   * Copies a Joshua config file, optionally replacing the model weights and the discriminative
   * model file name. Comment, empty, and "key=value" parameter lines are copied verbatim; model
   * lines have their trailing weight replaced by the corresponding entry of newWeights.
   *
   * @param newWeights one weight per model line, in file order, or null to keep existing weights
   * @param template path of the config file to read
   * @param outputFile path of the config file to write
   * @param newDiscriminativeModel replacement file name for the "discriminative" line, or null
   * @throws IllegalArgumentException if a model line's last field is not a number, or if the
   *           number of model lines does not match newWeights.length
   */
  public static void writeConfigFile(double[] newWeights, String template, String outputFile,
      String newDiscriminativeModel) {
    try {
      int columnID = 0;

      BufferedWriter writer = FileUtility.getWriteFileStream(outputFile);
      try {
        // Open the reader inside the writer's try so the writer is closed even if the
        // template file can't be opened.
        LineReader reader = new LineReader(template);
        try {
          for (String line : reader) {
            line = line.trim();
            if (Regex.commentOrEmptyLine.matches(line) || line.indexOf("=") != -1) {
              // comment, empty line, or parameter lines: just copy
              writer.write(line);
              writer.newLine();

            } else { // models: replace the weight
              String[] fds = Regex.spaces.split(line);
              StringBuilder newSent = new StringBuilder();
              if (!Regex.floatingNumber.matches(fds[fds.length - 1])) {
                throw new IllegalArgumentException("last field is not a number; the field is: "
                    + fds[fds.length - 1]);
              }

              if (newDiscriminativeModel != null && "discriminative".equals(fds[0])) {
                newSent.append(fds[0]).append(' ');
                newSent.append(newDiscriminativeModel).append(' ');// change the file name
                for (int i = 2; i < fds.length - 1; i++) {
                  newSent.append(fds[i]).append(' ');
                }
              } else {// regular
                for (int i = 0; i < fds.length - 1; i++) {
                  newSent.append(fds[i]).append(' ');
                }
              }
              if (newWeights != null)
                newSent.append(newWeights[columnID++]);// change the weight
              else
                newSent.append(fds[fds.length - 1]);// do not change

              writer.write(newSent.toString());
              writer.newLine();
            }
          }
        } finally {
          reader.close();
        }
      } finally {
        writer.close();
      }

      if (newWeights != null && columnID != newWeights.length) {
        throw new IllegalArgumentException("number of models does not match number of weights");
      }

    } catch (IOException e) {
      e.printStackTrace();
    }
  }

  // ===============================================================
  // Initialization Methods
  // ===============================================================

  /**
   * Initialize all parts of the JoshuaDecoder: weights, grammars, language models, feature
   * functions, and the decoder thread pool. Must be called before decoding.
   *
   * @param configFile File containing configuration options
   * @return An initialized decoder
   */
  public Decoder initialize(String configFile) {
    try {
      long pre_load_time = System.currentTimeMillis();

      /*
       * Weights can be listed in a separate file (denoted by parameter "weights-file") or directly
       * in the Joshua config file. Config file values take precedent.
       */
      Decoder.weights = this.readWeights(joshuaConfiguration.weights_file);

      for (int i = 0; i < joshuaConfiguration.weights.size(); i++) {
        String pair[] = joshuaConfiguration.weights.get(i).split("\\s+");

        /* Sanity check for old-style unsupported feature invocations. */
        if (pair.length != 2) {
          System.err.println("FATAL: Invalid feature weight line found in config file.");
          System.err
              .println(String.format("The line was '%s'", joshuaConfiguration.weights.get(i)));
          System.err
              .println("You might be using an old version of the config file that is no longer supported");
          System.err
              .println("Check joshua-decoder.org or email joshua_support@googlegroups.com for help");
          System.exit(17);
        }

        feature_names.add(pair[0]);
        weights.put(pair[0], Float.parseFloat(pair[1]));
      }

      // Ensure a BLEU weight exists (used e.g. by MBR / tuning code paths).
      if (!weights.containsKey("BLEU"))
        Decoder.weights.put("BLEU", 0.0f);

      int num_dense = 0;
      for (String feature : feature_names)
        if (FeatureVector.isDense(feature))
          num_dense++;

      System.err.println(String.format("Read %d sparse and %d dense weights", weights.size()
          - num_dense, num_dense));

      // Do this before loading the grammars and the LM.
      this.featureFunctions = new ArrayList<FeatureFunction>();

      // Initialize and load grammars.
      this.initializeTranslationGrammars();

      logger.info(String.format("Grammar loading took: %d seconds.",
          (System.currentTimeMillis() - pre_load_time) / 1000));

      // Initialize the LM.
      initializeLanguageModels();

      // Initialize the features: requires that LM model has been initialized.
      this.initializeFeatureFunctions();

      // Sort the TM grammars (needed to do cube pruning)
      if (joshuaConfiguration.amortized_sorting) {
        logger.info("Grammar sorting happening lazily on-demand.");
      } else {
        long pre_sort_time = System.currentTimeMillis();
        for (Grammar grammar : this.grammars) {
          grammar.sortGrammar(this.featureFunctions);
        }
        logger.info(String.format("Grammar sorting took %d seconds.",
            (System.currentTimeMillis() - pre_sort_time) / 1000));
      }

      // Create the threads
      for (int i = 0; i < joshuaConfiguration.num_parallel_decoders; i++) {
        this.threadPool.put(new DecoderThread(this.grammars, Decoder.weights,
            this.featureFunctions, joshuaConfiguration));
      }
    } catch (IOException e) {
      e.printStackTrace();
    } catch (InterruptedException e) {
      // Restore the interrupt flag; initialization is incomplete at this point.
      Thread.currentThread().interrupt();
      e.printStackTrace();
    }

    return this;
  }

  /**
   * Instantiates the language models listed on the config file's "lm" lines and registers a
   * feature function (KenLMFF or LanguageModelFF) for each one.
   *
   * @throws IOException if a language model file cannot be read
   */
  private void initializeLanguageModels() throws IOException {

    this.languageModels = new ArrayList<NGramLanguageModel>();

    // lm = kenlm 5 0 0 100 file
    for (String lmLine : joshuaConfiguration.lms) {
      logger.info("lm line: " + lmLine);

      String tokens[] = lmLine.split("\\s+");
      String lm_type = tokens[0];
      int lm_order = Integer.parseInt(tokens[1]);
      boolean minimizing = Boolean.parseBoolean(tokens[2]);
      // NOTE(review): fields 3 and 4 of the lm line are ignored; field 5 must be the file path.
      String lm_file = tokens[5];

      if (lm_type.equals("kenlm")) {
        KenLM lm = new KenLM(lm_order, lm_file, minimizing);
        this.languageModels.add(lm);
        Vocabulary.registerLanguageModel(lm);
        Vocabulary.id(joshuaConfiguration.default_non_terminal);

      } else if (lm_type.equals("berkeleylm")) {
        LMGrammarBerkeley lm = new LMGrammarBerkeley(lm_order, lm_file);
        this.languageModels.add(lm);
        Vocabulary.registerLanguageModel(lm);
        Vocabulary.id(joshuaConfiguration.default_non_terminal);

      } else if (lm_type.equals("none")) {
        ; // do nothing

      } else {
        logger.warning("WARNING: using built-in language model; you probably didn't intend this");
        logger.warning("  Valid lm types are 'kenlm', 'berkeleylm', 'none'");
      }
    }

    // One LM feature function per model; KenLM models in minimizing mode get the
    // state-minimizing KenLMFF wrapper.
    for (int i = 0; i < this.languageModels.size(); i++) {
      NGramLanguageModel lm = this.languageModels.get(i);
      if (lm instanceof KenLM && lm.isMinimizing()) {
        this.featureFunctions.add(new KenLMFF(weights, String.format("lm_%d", i), (KenLM) lm));
      } else {
        this.featureFunctions.add(new LanguageModelFF(weights, String.format("lm_%d", i), lm));
      }
    }
  }

  /**
   * Loads the translation grammars listed on the config file's "tm" lines (packed, phrase, or
   * memory-based), falling back to a dummy glue grammar when none are supplied. Also creates one
   * PhraseModelFF per distinct grammar owner.
   *
   * @throws IOException if a grammar file cannot be read
   */
  private void initializeTranslationGrammars() throws IOException {

    if (joshuaConfiguration.tms.size() > 0) {

      // Records which PhraseModelFF's have been instantiated (one is needed for each owner).
      HashSet<String> ownersSeen = new HashSet<String>();

      // tm = {thrax/hiero,packed,samt} OWNER LIMIT FILE
      for (String tmLine : joshuaConfiguration.tms) {
        String tokens[] = tmLine.split("\\s+");
        String format = tokens[0];
        String owner = tokens[1];
        int span_limit = Integer.parseInt(tokens[2]);
        String file = tokens[3];

        Grammar grammar = null;
        if (format.equals("packed") || new File(file).isDirectory()) {
          try {
            grammar = new PackedGrammar(file, span_limit, owner, joshuaConfiguration);
          } catch (FileNotFoundException e) {
            System.err.println(String.format("Couldn't load packed grammar from '%s'", file));
            System.err.println("Perhaps it doesn't exist, or it may be an old packed file format.");
            System.exit(2);
          }

        } else if (format.equals("phrase")) {
          joshuaConfiguration.phrase_based = true;
          grammar = new PhraseTable(file, owner, joshuaConfiguration, featureFunctions);

        } else {
          // thrax, hiero, samt
          grammar = new MemoryBasedBatchGrammar(format, file, owner,
              joshuaConfiguration.default_non_terminal, span_limit, joshuaConfiguration);
        }

        this.grammars.add(grammar);

        // Record the owner so we can create a feature function for her.
        ownersSeen.add(owner);
      }

      /*
       * Create and add a feature function for this owner, the first time we see each owner.
       *
       * Warning! This needs to be done *after* initializing the grammars, in case there is a packed
       * grammar, since it resets the vocabulary.
       */
      for (String owner : ownersSeen) {
        this.featureFunctions.add(new PhraseModelFF(weights, owner));
      }

    } else {
      logger.warning("* WARNING: no grammars supplied!  Supplying dummy glue grammar.");
      // TODO: this should initialize the grammar dynamically so that the goal symbol and default
      // non terminal match
      MemoryBasedBatchGrammar glueGrammar = new MemoryBasedBatchGrammar("thrax", String.format(
          "%s/data/glue-grammar", System.getenv().get("JOSHUA")), "glue",
          joshuaConfiguration.default_non_terminal, -1, joshuaConfiguration);
      this.grammars.add(glueGrammar);
    }

    logger.info(String.format("Memory used %.1f MB", ((Runtime.getRuntime().totalMemory() - Runtime
        .getRuntime().freeMemory()) / 1000000.0)));
  }

  /*
   * This function reads the weights for the model. Feature names and their weights are listed one
   * per line in the following format:
   *
   * FEATURE_NAME WEIGHT
   *
   * Returns an empty FeatureVector when fileName is empty; exits the JVM if the file is missing
   * or unreadable. Also appends each feature name to the static feature_names list.
   */
  private FeatureVector readWeights(String fileName) {
    // Note: this local deliberately shadows the static Decoder.weights; the caller assigns
    // the returned vector to the static field.
    FeatureVector weights = new FeatureVector();

    if (fileName.equals(""))
      return new FeatureVector();

    try {
      LineReader lineReader = new LineReader(fileName);

      for (String line : lineReader) {
        line = line.replaceAll("\\s+", " ");

        // Skip blanks, comments, and lines without a "NAME WEIGHT" pair.
        if (line.equals("") || line.startsWith("#") || line.startsWith("//")
            || line.indexOf(' ') == -1)
          continue;

        String tokens[] = line.split("\\s+");
        String feature = tokens[0];
        float value = Float.parseFloat(tokens[1]);

        weights.put(feature, value);
        feature_names.add(feature);
      }
    } catch (FileNotFoundException ioe) {
      System.err.println("* FATAL: Can't find weights-file '" + fileName + "'");
      System.exit(1);
    } catch (IOException ioe) {
      System.err.println("* FATAL: Can't read weights-file '" + fileName + "'");
      ioe.printStackTrace();
      System.exit(1);
    }

    logger.info(String.format("Read %d weights from file '%s'", weights.size(), fileName));

    return weights;
  }

  /**
   * Feature functions are instantiated with a line of the form
   *
   * <pre>
   *   feature_function = FEATURE OPTIONS
   * </pre>
   *
   * Weights for features are listed separately. Unknown feature names are resolved reflectively
   * against joshua.decoder.ff.&lt;Name&gt;FF; failure to resolve is fatal.
   */
  private void initializeFeatureFunctions() {

    usingNonlocalFeatures = true;

    for (String featureLine : joshuaConfiguration.features) {

      // Get rid of the leading crap.
      featureLine = featureLine.replaceFirst("^feature_function\\s*=\\s*", "");

      String fields[] = featureLine.split("\\s+");
      String featureName = fields[0];
      String feature = featureName.toLowerCase();

      if (feature.equals("latticecost") || feature.equals("sourcepath")) {
        this.featureFunctions.add(new SourcePathFF(Decoder.weights));
      }

      else if (feature.equals("arityphrasepenalty") || feature.equals("aritypenalty")) {
        String owner = fields[1];
        int startArity = Integer.parseInt(fields[2].trim());
        int endArity = Integer.parseInt(fields[3].trim());
        this.featureFunctions.add(new ArityPhrasePenaltyFF(weights, String.format("%s %d %d",
            owner, startArity, endArity)));

      } else if (feature.equals("wordpenalty")) {
        this.featureFunctions.add(new WordPenaltyFF(weights));

      } else if (feature.equals("oovpenalty")) {
        this.featureFunctions.add(new OOVFF(weights));

      } else if (feature.equals("rulelength")) {
        this.featureFunctions.add(new RuleLengthFF(weights));

      } else if (feature.equals("edgephrasesimilarity")) {
        String host = fields[1].trim();
        int port = Integer.parseInt(fields[2].trim());

        try {
          this.featureFunctions.add(new EdgePhraseSimilarityFF(weights, host, port));
        } catch (Exception e) {
          e.printStackTrace();
          System.exit(1);
        }

      } else if (feature.equals("phrasemodel") || feature.equals("tm")) {
        // Old-style inline weight specification: records a tm_OWNER_INDEX weight but
        // adds no feature function (PhraseModelFF is created per owner elsewhere).
        String owner = fields[1].trim();
        String index = fields[2].trim();
        float weight = Float.parseFloat(fields[3]);

        weights.put(String.format("tm_%s_%s", owner, index), weight);

      } else if (feature.equals("fragmentlm")) {
        this.featureFunctions.add(new FragmentLMFF(Decoder.weights, featureLine));

      } else if (feature.equals("rule")) {
        this.featureFunctions.add(new RuleFF(Decoder.weights, featureLine));

      } else if (feature.equals("phrasepenalty")) {
        this.featureFunctions.add(new PhrasePenaltyFF(Decoder.weights, featureLine));

      } else if (feature.equals(LabelCombinationFF.getLowerCasedFeatureName())) {
        this.featureFunctions.add(new LabelCombinationFF(weights));

      } else if (feature.equals(LabelSubstitutionFF.getLowerCasedFeatureName())) {
        this.featureFunctions.add(new LabelSubstitutionFF(weights));

      } else if (feature.equals("distortion")) {
        this.featureFunctions.add(new DistortionFF(weights));

      } else {
        // Fall back to reflection: joshua.decoder.ff.<Name>FF(FeatureVector, String[]).
        try {
          Class<?> clas = Class.forName(String.format("joshua.decoder.ff.%sFF", featureName));
          Constructor<?> constructor = clas.getConstructor(FeatureVector.class, String[].class);
          this.featureFunctions.add((FeatureFunction) constructor.newInstance(weights, fields));
        } catch (Exception e) {
          e.printStackTrace();
          System.err.println("* WARNING: invalid feature '" + featureLine + "'");
          System.exit(1);
        }
      }
    }

    for (FeatureFunction feature : featureFunctions) {
      logger.info(String.format("FEATURE: %s", feature.logString()));
    }
  }
}