src/joshua/decoder/Decoder.java - joshua - Git at Google

 package joshua.decoder;

 import java.io.BufferedWriter;
 import java.io.File;
 import java.io.IOException;
 import java.io.FileNotFoundException;
 import java.lang.reflect.Constructor;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.BlockingQueue;

 import joshua.corpus.Vocabulary;
 import joshua.decoder.ff.FeatureVector;
 import joshua.decoder.ff.FeatureFunction;
 import joshua.decoder.ff.PhraseModel;
 import joshua.decoder.ff.tm.Grammar;
 import joshua.decoder.ff.tm.Rule;
 import joshua.decoder.ff.tm.format.HieroFormatReader;
 import joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar;
 import joshua.decoder.ff.tm.packed.PackedGrammar;
 import joshua.decoder.io.TranslationRequest;
 import joshua.decoder.phrase.PhraseTable;
 import joshua.decoder.segment_file.Sentence;
 import joshua.util.FileUtility;
 import joshua.util.FormatUtils;
 import joshua.util.Regex;
 import joshua.util.io.LineReader;

 /**
  * This class handles decoder initialization and the complication introduced by multithreading.
  *
  * After initialization, the main entry point to the Decoder object is
  * decodeAll(TranslationRequest), which returns a set of Translation objects wrapped in an iterable
  * Translations object. It is important that we support multithreading both (a) across the sentences
  * within a request and (b) across requests, in a round-robin fashion. This is done by maintaining a
  * fixed sized concurrent thread pool. When a new request comes in, a RequestHandler thread is
  * launched. This object reads iterates over the request's sentences, obtaining a thread from the
  * thread pool, and using that thread to decode the sentence. If a decoding thread is not available,
  * it will block until one is in a fair (FIFO) manner. This maintains fairness across requests so
  * long as each request only requests thread when it has a sentence ready.
  *
  * A decoding thread is handled by DecoderThread and launched from DecoderThreadRunner. The purpose
  * of the runner is to record where to place the translated sentence when it is done (i.e., which
  * Translations object). Translations itself is an iterator whose next() call blocks until the next
  * translation is available.
  *
  * @author Matt Post <post@cs.jhu.edu>
  * @author Zhifei Li, <zhifei.work@gmail.com>
  * @author wren ng thornton <wren@users.sourceforge.net>
  * @author Lane Schwartz <dowobeha@users.sourceforge.net>
  */
 public class Decoder {

   private final JoshuaConfiguration joshuaConfiguration;

   /*
    * Many of these objects themselves are global objects. We pass them in when constructing other
    * objects, so that they all share pointers to the same object. This is good because it reduces
    * overhead, but it can be problematic because of unseen dependencies (for example, in the
    * Vocabulary shared by language model, translation grammar, etc).
    */
   private List<Grammar> grammars;
   private ArrayList<FeatureFunction> featureFunctions;

   /* The feature weights. */
   public static FeatureVector weights;

   public static int VERBOSE = 1;

   private BlockingQueue<DecoderThread> threadPool = null;

   // ===============================================================
   // Constructors
   // ===============================================================

   /**
    * Constructor method that creates a new decoder using the specified configuration file.
    *
    * @param configFile Name of configuration file.
    */
   public Decoder(JoshuaConfiguration joshuaConfiguration, String configFile) {

     this(joshuaConfiguration);
     this.initialize(configFile);
   }

   /**
    * Factory method that creates a new decoder using the specified configuration file.
    *
    * @param configFile Name of configuration file.
    */
   public static Decoder createDecoder(String configFile) {
     JoshuaConfiguration joshuaConfiguration = new JoshuaConfiguration();
     return new Decoder(joshuaConfiguration, configFile);
   }

   /**
    * Constructs an uninitialized decoder for use in testing.
    * <p>
    * This method is private because it should only ever be called by the
    * {@link #getUninitalizedDecoder()} method to provide an uninitialized decoder for use in
    * testing.
    */
   private Decoder(JoshuaConfiguration joshuaConfiguration) {
     this.joshuaConfiguration = joshuaConfiguration;
     this.grammars = new ArrayList<Grammar>();
     this.threadPool = new ArrayBlockingQueue<DecoderThread>(
         this.joshuaConfiguration.num_parallel_decoders, true);
   }

   /**
    * Gets an uninitialized decoder for use in testing.
    * <p>
    * This method is called by unit tests or any outside packages (e.g., MERT) relying on the
    * decoder.
    */
   static public Decoder getUninitalizedDecoder(JoshuaConfiguration joshuaConfiguration) {
     return new Decoder(joshuaConfiguration);
   }

   // ===============================================================
   // Public Methods
   // ===============================================================

   /**
    * This class is responsible for getting sentences from the TranslationRequest and procuring a
    * DecoderThreadRunner to translate it. Each call to decodeAll(TranslationRequest) launches a
    * thread that will read the request's sentences, obtain a DecoderThread to translate them, and
    * then place the Translation in the appropriate place.
    *
    * @author Matt Post <post@cs.jhu.edu>
    *
    */
   private class RequestHandler extends Thread {
     /* Source of sentences to translate. */
     private final TranslationRequest request;

     /* Where to put translated sentences. */
     private final Translations response;

     RequestHandler(TranslationRequest request, Translations response) {
       this.request = request;
       this.response = response;
     }

     @Override
     public void run() {
       /*
        * Repeatedly get an input sentence, wait for a DecoderThread, and then start a new thread to
        * translate the sentence. We start a new thread (via DecoderRunnerThread) as opposed to
        * blocking, so that the RequestHandler can go on to the next sentence in this request, which
        * allows parallelization across the sentences of the request.
        */
       for (;;) {
         Sentence sentence = request.next();
         if (sentence == null) {
           response.finish();
           break;
         }

         // This will block until a DecoderThread becomes available.
         DecoderThread thread = Decoder.this.getThread();
         new DecoderThreadRunner(thread, sentence, response).start();
       }
     }
   }

   /**
    * Retrieve a thread from the thread pool, blocking until one is available. The blocking occurs in
    * a fair fashion (i.e,. FIFO across requests).
    *
    * @return a thread that can be used for decoding.
    */
   public DecoderThread getThread() {
     try {
       return threadPool.take();
     } catch (InterruptedException e) {
       // TODO Auto-generated catch block
       e.printStackTrace();
     }
     return null;
   }

   /**
    * This class handles running a DecoderThread (which takes care of the actual translation of an
    * input Sentence, returning a Translation object when its done). This is done in a thread so as
    * not to tie up the RequestHandler that launched it, freeing it to go on to the next sentence in
    * the TranslationRequest, in turn permitting parallelization across the sentences of a request.
    *
    * When the decoder thread is finshed, the Translation object is placed in the correct place in
    * the corresponding Translations object that was returned to the caller of
    * Decoder.decodeAll(TranslationRequest).
    *
    * @author Matt Post <post@cs.jhu.edu>
    */
   private class DecoderThreadRunner extends Thread {

     private final DecoderThread decoderThread;
     private final Sentence sentence;
     private final Translations translations;

     DecoderThreadRunner(DecoderThread thread, Sentence sentence, Translations translations) {
       this.decoderThread = thread;
       this.sentence = sentence;
       this.translations = translations;
     }

     @Override
     public void run() {
       /*
        * Use the thread to translate the sentence. Then record the translation with the
        * corresponding Translations object, and return the thread to the pool.
        */
       try {
         Translation translation = decoderThread.translate(this.sentence);
         translations.record(translation);

         /*
          * This is crucial! It's what makes the thread available for the next sentence to be
          * translated.
          */
         threadPool.put(decoderThread);
       } catch (Exception e) {
         System.err.println(String.format(
             "Input %d: FATAL UNCAUGHT EXCEPTION: %s", sentence.id(), e.getMessage()));
         e.printStackTrace();
         System.exit(1);;
 //        translations.record(new Translation(sentence, null, featureFunctions, joshuaConfiguration));
       }
     }
   }

   /**
    * This function is the main entry point into the decoder. It translates all the sentences in a
    * (possibly boundless) set of input sentences. Each request launches its own thread to read the
    * sentences of the request.
    *
    * @param request
    * @return an iterable set of Translation objects
    */
   public Translations decodeAll(TranslationRequest request) {
     Translations translations = new Translations(request);

     new RequestHandler(request, translations).start();

     return translations;
   }

   /**
    * We can also just decode a single sentence.
    *
    * @param sentence
    * @return The translated sentence
    */
   public Translation decode(Sentence sentence) {
     // Get a thread.

     try {
       DecoderThread thread = threadPool.take();
       Translation translation = thread.translate(sentence);
       threadPool.put(thread);

       return translation;

     } catch (InterruptedException e) {
       e.printStackTrace();
     }

     return null;
   }

   public void cleanUp() {
     for (DecoderThread thread : threadPool) {
       try {
         thread.join();
       } catch (InterruptedException e) {
         // TODO Auto-generated catch block
         e.printStackTrace();
       }
     }
   }

   public static void writeConfigFile(double[] newWeights, String template, String outputFile,
       String newDiscriminativeModel) {
     try {
       int columnID = 0;

       BufferedWriter writer = FileUtility.getWriteFileStream(outputFile);
       LineReader reader = new LineReader(template);
       try {
         for (String line : reader) {
           line = line.trim();
           if (Regex.commentOrEmptyLine.matches(line) || line.indexOf("=") != -1) {
             // comment, empty line, or parameter lines: just copy
             writer.write(line);
             writer.newLine();

           } else { // models: replace the weight
             String[] fds = Regex.spaces.split(line);
             StringBuffer newSent = new StringBuffer();
             if (!Regex.floatingNumber.matches(fds[fds.length - 1])) {
               throw new IllegalArgumentException("last field is not a number; the field is: "
                   + fds[fds.length - 1]);
             }

             if (newDiscriminativeModel != null && "discriminative".equals(fds[0])) {
               newSent.append(fds[0]).append(' ');
               newSent.append(newDiscriminativeModel).append(' ');// change the
                                                                  // file name
               for (int i = 2; i < fds.length - 1; i++) {
                 newSent.append(fds[i]).append(' ');
               }
             } else {// regular
               for (int i = 0; i < fds.length - 1; i++) {
                 newSent.append(fds[i]).append(' ');
               }
             }
             if (newWeights != null)
               newSent.append(newWeights[columnID++]);// change the weight
             else
               newSent.append(fds[fds.length - 1]);// do not change

             writer.write(newSent.toString());
             writer.newLine();
           }
         }
       } finally {
         reader.close();
         writer.close();
       }

       if (newWeights != null && columnID != newWeights.length) {
         throw new IllegalArgumentException("number of models does not match number of weights");
       }

     } catch (IOException e) {
       e.printStackTrace();
     }
   }

   // ===============================================================
   // Initialization Methods
   // ===============================================================

   /**
    * Moses requires the pattern .*_.* for sparse features, and prohibits underscores in dense features.
    * This conforms to that pattern. We assume non-conforming dense features start with tm_ or lm_,
    * and the only sparse feature that needs converting is OOVPenalty.
    *
    * @param feature
    * @return the feature in Moses format
    */
   private String mosesize(String feature) {
     if (joshuaConfiguration.moses) {
       if (feature.startsWith("tm_") || feature.startsWith("lm_"))
         return feature.replace("_", "-");
     }

     return feature;
   }

   /**
    * Initialize all parts of the JoshuaDecoder.
    *
    * @param configFile File containing configuration options
    * @return An initialized decoder
    */
   public Decoder initialize(String configFile) {
     try {

       long pre_load_time = System.currentTimeMillis();

       /* Weights can be listed in a separate file (denoted by parameter "weights-file") or directly
        * in the Joshua config file. Config file values take precedent.
        */
       this.readWeights(joshuaConfiguration.weights_file);


       /* Add command-line-passed weights to the weights array for processing below */
       if (joshuaConfiguration.weight_overwrite != "") {
         String[] tokens = joshuaConfiguration.weight_overwrite.split("\\s+");
         for (int i = 0; i < tokens.length; i += 2) {
           String feature = tokens[i];
           float value = Float.parseFloat(tokens[i+1]);

           if (joshuaConfiguration.moses)
             feature = demoses(feature);

           joshuaConfiguration.weights.add(String.format("%s %s", feature, tokens[i+1]));
           Decoder.LOG(1, String.format("COMMAND LINE WEIGHT: %s -> %.3f", feature, value));
         }
       }

       /* Read the weights found in the config file */
       for (String pairStr: joshuaConfiguration.weights) {
         String pair[] = pairStr.split("\\s+");

         /* Sanity check for old-style unsupported feature invocations. */
         if (pair.length != 2) {
           System.err.println("FATAL: Invalid feature weight line found in config file.");
           System.err
               .println(String.format("The line was '%s'", pairStr));
           System.err
               .println("You might be using an old version of the config file that is no longer supported");
           System.err
               .println("Check joshua-decoder.org or email joshua_support@googlegroups.com for help");
           System.exit(17);
         }

         weights.set(pair[0], Float.parseFloat(pair[1]));
       }

       Decoder.LOG(1, String.format("Read %d weights (%d of them dense)", weights.size(),
           weights.DENSE_FEATURE_NAMES.size()));

       // Do this before loading the grammars and the LM.
       this.featureFunctions = new ArrayList<FeatureFunction>();

       // Initialize and load grammars. This must happen first, since the vocab gets defined by
       // the packed grammar (if any)
       this.initializeTranslationGrammars();

       Decoder.LOG(1, String.format("Grammar loading took: %d seconds.",
           (System.currentTimeMillis() - pre_load_time) / 1000));

       // Initialize the features: requires that LM model has been initialized.
       this.initializeFeatureFunctions();

       // This is mostly for compatibility with the Moses tuning script
       if (joshuaConfiguration.show_weights_and_quit) {
         for (int i = 0; i < weights.DENSE_FEATURE_NAMES.size(); i++) {
           String name = weights.DENSE_FEATURE_NAMES.get(i);
           if (joshuaConfiguration.moses)
             System.out.println(String.format("%s= %.5f", mosesize(name), weights.getDense(i)));
           else
             System.out.println(String.format("%s %.5f", name, weights.getDense(i)));
         }
         System.exit(0);
       }

       // Sort the TM grammars (needed to do cube pruning)
       if (joshuaConfiguration.amortized_sorting) {
         Decoder.LOG(1, "Grammar sorting happening lazily on-demand.");
       } else {
         long pre_sort_time = System.currentTimeMillis();
         for (Grammar grammar : this.grammars) {
           grammar.sortGrammar(this.featureFunctions);
         }
         Decoder.LOG(1, String.format("Grammar sorting took %d seconds.",
             (System.currentTimeMillis() - pre_sort_time) / 1000));
       }

       // Create the threads
       for (int i = 0; i < joshuaConfiguration.num_parallel_decoders; i++) {
         this.threadPool.put(new DecoderThread(this.grammars, Decoder.weights,
             this.featureFunctions, joshuaConfiguration));
       }

     } catch (IOException e) {
       e.printStackTrace();
     } catch (InterruptedException e) {
       // TODO Auto-generated catch block
       e.printStackTrace();
     }

     return this;
   }

   /**
    * Initializes translation grammars Retained for backward compatibility
    *
    * @param ownersSeen Records which PhraseModelFF's have been instantiated (one is needed for each
    *          owner)
    * @throws IOException
    */
   private void initializeTranslationGrammars() throws IOException {

     if (joshuaConfiguration.tms.size() > 0) {

       // collect packedGrammars to check if they use a shared vocabulary
       final List<PackedGrammar> packed_grammars = new ArrayList<>();

       // tm = {thrax/hiero,packed,samt,moses} OWNER LIMIT FILE
       for (String tmLine : joshuaConfiguration.tms) {

         String type = tmLine.substring(0,  tmLine.indexOf(' '));
         String[] args = tmLine.substring(tmLine.indexOf(' ')).trim().split("\\s+");
         HashMap<String, String> parsedArgs = FeatureFunction.parseArgs(args);

         String owner = parsedArgs.get("owner");
         int span_limit = Integer.parseInt(parsedArgs.get("maxspan"));
         String path = parsedArgs.get("path");

         Grammar grammar = null;
         if (! type.equals("moses") && ! type.equals("phrase")) {
           if (new File(path).isDirectory()) {
             try {
               PackedGrammar packed_grammar = new PackedGrammar(path, span_limit, owner, type, joshuaConfiguration);
               packed_grammars.add(packed_grammar);
               grammar = packed_grammar;
             } catch (FileNotFoundException e) {
               System.err.println(String.format("Couldn't load packed grammar from '%s'", path));
               System.err.println("Perhaps it doesn't exist, or it may be an old packed file format.");
               System.exit(2);
             }
           } else {
             // thrax, hiero, samt
             grammar = new MemoryBasedBatchGrammar(type, path, owner,
                 joshuaConfiguration.default_non_terminal, span_limit, joshuaConfiguration);
           }

         } else {

           int maxSourceLen = parsedArgs.containsKey("max-source-len")
               ? Integer.parseInt(parsedArgs.get("max-source-len"))
               : -1;

           joshuaConfiguration.search_algorithm = "stack";
           grammar = new PhraseTable(path, owner, type, joshuaConfiguration, maxSourceLen);
         }

         this.grammars.add(grammar);
       }

       checkSharedVocabularyChecksumsForPackedGrammars(packed_grammars);

     } else {
       Decoder.LOG(1, "* WARNING: no grammars supplied!  Supplying dummy glue grammar.");
       MemoryBasedBatchGrammar glueGrammar = new MemoryBasedBatchGrammar("glue", joshuaConfiguration);
       glueGrammar.setSpanLimit(-1);
       glueGrammar.addGlueRules(featureFunctions);
       this.grammars.add(glueGrammar);
     }

     /* Create an epsilon-deleting grammar */
     if (joshuaConfiguration.lattice_decoding) {
       Decoder.LOG(1, "Creating an epsilon-deleting grammar");
       MemoryBasedBatchGrammar latticeGrammar = new MemoryBasedBatchGrammar("lattice", joshuaConfiguration);
       latticeGrammar.setSpanLimit(-1);
       HieroFormatReader reader = new HieroFormatReader();

       String goalNT = FormatUtils.cleanNonTerminal(joshuaConfiguration.goal_symbol);
       String defaultNT = FormatUtils.cleanNonTerminal(joshuaConfiguration.default_non_terminal);

       String ruleString = String.format("[%s] ||| [%s,1] <eps> ||| [%s,1] ||| ", goalNT, goalNT, defaultNT,
           goalNT, defaultNT);

       Rule rule = reader.parseLine(ruleString);
       latticeGrammar.addRule(rule);
       rule.estimateRuleCost(featureFunctions);

       this.grammars.add(latticeGrammar);
     }

     /* Now create a feature function for each owner */
     HashSet<String> ownersSeen = new HashSet<String>();

     for (Grammar grammar: this.grammars) {
       String owner = Vocabulary.word(grammar.getOwner());
       if (! ownersSeen.contains(owner)) {
         this.featureFunctions.add(new PhraseModel(weights, new String[] { "tm", "-owner", owner },
             joshuaConfiguration, grammar));
         ownersSeen.add(owner);
       }
     }

     Decoder.LOG(1, String.format("Memory used %.1f MB",
         ((Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory()) / 1000000.0)));
   }

   /**
    * Checks if multiple packedGrammars have the same vocabulary by comparing their vocabulary file checksums.
    */
   private static void checkSharedVocabularyChecksumsForPackedGrammars(final List<PackedGrammar> packed_grammars) {
     String previous_checksum = "";
     for (PackedGrammar grammar : packed_grammars) {
       final String checksum = grammar.computeVocabularyChecksum();
       if (previous_checksum.isEmpty()) {
         previous_checksum = checksum;
       } else {
         if (!checksum.equals(previous_checksum)) {
           throw new RuntimeException(
               "Trying to load multiple packed grammars with different vocabularies!" +
               "Have you packed them jointly?");
         }
         previous_checksum = checksum;
       }
     }
   }

   /*
    * This function reads the weights for the model. Feature names and their weights are listed one
    * per line in the following format:
    *
    * FEATURE_NAME WEIGHT
    */
   private void readWeights(String fileName) {
     Decoder.weights = new FeatureVector();

     if (fileName.equals(""))
       return;

     try {
       LineReader lineReader = new LineReader(fileName);

       for (String line : lineReader) {
         line = line.replaceAll("\\s+", " ");

         if (line.equals("") || line.startsWith("#") || line.startsWith("//")
             || line.indexOf(' ') == -1)
           continue;

         String tokens[] = line.split("\\s+");
         String feature = tokens[0];
         Float value = Float.parseFloat(tokens[1]);

         // Kludge for compatibility with Moses tuners
         if (joshuaConfiguration.moses) {
           feature = demoses(feature);
         }

         weights.increment(feature, value);
       }
     } catch (FileNotFoundException ioe) {
       System.err.println("* FATAL: Can't find weights-file '" + fileName + "'");
       System.exit(1);
     } catch (IOException ioe) {
       System.err.println("* FATAL: Can't read weights-file '" + fileName + "'");
       ioe.printStackTrace();
       System.exit(1);
     }

     Decoder.LOG(1, String.format("Read %d weights from file '%s'", weights.size(), fileName));
   }

   private String demoses(String feature) {
     if (feature.endsWith("="))
       feature = feature.replace("=", "");
     if (feature.equals("OOV_Penalty"))
       feature = "OOVPenalty";
     else if (feature.startsWith("tm-") || feature.startsWith("lm-"))
       feature = feature.replace("-",  "_");
     return feature;
   }

   /**
    * Feature functions are instantiated with a line of the form
    *
    * <pre>
    *   feature_function = FEATURE OPTIONS
    * </pre>
    *
    * Weights for features are listed separately.
    *
    * @param tmOwnersSeen
    * @throws IOException
    *
    */
   private void initializeFeatureFunctions() throws IOException {

     for (String featureLine : joshuaConfiguration.features) {
       // feature-function = NAME args
       // 1. create new class named NAME, pass it config, weights, and the args

       // Get rid of the leading crap.
       featureLine = featureLine.replaceFirst("^feature_function\\s*=\\s*", "");

       String fields[] = featureLine.split("\\s+");
       String featureName = fields[0];
       try {
         Class<?> clas = getClass(featureName);
         Constructor<?> constructor = clas.getConstructor(FeatureVector.class,
             String[].class, JoshuaConfiguration.class);
         this.featureFunctions.add((FeatureFunction) constructor.newInstance(weights, fields, joshuaConfiguration));
       } catch (Exception e) {
         e.printStackTrace();
         System.err.println("* FATAL: could not find a feature '" + featureName + "'");
         System.exit(1);
       }
     }

     for (FeatureFunction feature : featureFunctions) {
       Decoder.LOG(1, String.format("FEATURE: %s", feature.logString()));

     }

     weights.registerDenseFeatures(featureFunctions);
   }

   /**
    * Searches a list of predefined paths for classes, and returns the first one found. Meant for
    * instantiating feature functions.
    *
    * @param name
    * @return the class, found in one of the search paths
    * @throws ClassNotFoundException
    */
   private Class<?> getClass(String featureName) {
     Class<?> clas = null;
     String[] packages = { "joshua.decoder.ff", "joshua.decoder.ff.lm", "joshua.decoder.ff.phrase" };
     for (String path : packages) {
       try {
         clas = Class.forName(String.format("%s.%s", path, featureName));
         break;
       } catch (ClassNotFoundException e) {
         try {
           clas = Class.forName(String.format("%s.%sFF", path, featureName));
           break;
         } catch (ClassNotFoundException e2) {
           // do nothing
         }
       }
     }
     return clas;
   }

   public static boolean VERBOSE(int i) {
     return i <= VERBOSE;
   }

   public static void LOG(int i, String msg) {
     if (VERBOSE(i))
       System.err.println(msg);
   }
 }