src/joshua/decoder/ff/tm/packed/PackedGrammar.java - joshua - Git at Google

 package joshua.decoder.ff.tm.packed;

 /***
  * This package implements Joshua's packed grammar structure, which enables the efficient loading
  * and accessing of grammars. It is described in the paper:
  *
  * @article{ganitkevitch2012joshua,
  *   Author = {Ganitkevitch, J. and Cao, Y. and Weese, J. and Post, M. and Callison-Burch, C.},
  *   Journal = {Proceedings of WMT12},
  *   Title = {Joshua 4.0: Packing, PRO, and paraphrases},
  *   Year = {2012}}
  *
  * The packed grammar works by compiling out the grammar tries into a compact format that is loaded
  * and parsed directly from Java arrays. A fundamental problem is that Java arrays are indexed
  * by ints and not longs, meaning the maximum size of the packed grammar is about 2 GB. This forces
  * the use of packed grammar slices, which together constitute the grammar. The figure in the
  * paper above shows what each slice looks like.
  *
  * The division across slices is done in a depth-first manner. Consider the entire grammar organized
  * into a single source-side trie. The splits across tries are done by grouping the root-level
  * outgoing trie arcs --- and the entire trie beneath them --- across slices.
  *
  * This presents a problem: if the subtree rooted beneath a single top-level arc is too big for a
  * slice, the grammar can't be packed. This happens with very large Hiero grammars, for example,
  * where there are a *lot* of rules that start with [X].
  *
  * A solution being worked on is to split that symbol and pack them into separate grammars with a
  * shared vocabulary, and then rely on Joshua's ability to query multiple grammars for rules to
  * solve this problem. This is not currently implemented but could be done directly in the
  * Grammar Packer.
  */

 import java.io.BufferedInputStream;
 import java.io.DataInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.RandomAccessFile;
 import java.nio.IntBuffer;
 import java.nio.MappedByteBuffer;
 import java.nio.channels.FileChannel;
 import java.nio.channels.FileChannel.MapMode;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Comparator;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;

 import joshua.corpus.Vocabulary;
 import joshua.decoder.Decoder;
 import joshua.decoder.JoshuaConfiguration;
 import joshua.decoder.ff.FeatureFunction;
 import joshua.decoder.ff.FeatureVector;
 import joshua.decoder.ff.tm.AbstractGrammar;
 import joshua.decoder.ff.tm.BasicRuleCollection;
 import joshua.decoder.ff.tm.Rule;
 import joshua.decoder.ff.tm.RuleCollection;
 import joshua.decoder.ff.tm.Trie;
 import joshua.decoder.ff.tm.hash_based.ExtensionIterator;
 import joshua.util.encoding.EncoderConfiguration;
 import joshua.util.encoding.FloatEncoder;
 import joshua.util.io.LineReader;

 public class PackedGrammar extends AbstractGrammar {

   private EncoderConfiguration encoding;

   private PackedRoot root;
   private ArrayList<PackedSlice> slices;

   // The grammar specification keyword (e.g., "thrax" or "moses")
   private String type;

   public PackedGrammar(String grammar_dir, int span_limit, String owner, String type,
       JoshuaConfiguration joshuaConfiguration) throws FileNotFoundException, IOException {
     super(joshuaConfiguration);
     this.spanLimit = span_limit;
     this.type = type;

     // Read the vocabulary.
     String vocabFile = grammar_dir + File.separator + "vocabulary";
     Decoder.LOG(1, String.format("Reading vocabulary: %s", vocabFile));
     Vocabulary.read(vocabFile);

     // Read the config
     String configFile = grammar_dir + File.separator + "config";
     if (new File(configFile).exists()) {
       Decoder.LOG(1, String.format("Reading packed config: %s", configFile));
       readConfig(configFile);
     }

     // Read the quantizer setup.
     Decoder.LOG(1, String.format("Reading encoder configuration: %s%sencoding", grammar_dir, File.separator));
     encoding = new EncoderConfiguration();
     encoding.load(grammar_dir + File.separator + "encoding");

     // Set phrase owner.
     this.owner = Vocabulary.id(owner);

     String[] listing = new File(grammar_dir).list();
     slices = new ArrayList<PackedSlice>();
     for (int i = 0; i < listing.length; i++) {
       if (listing[i].startsWith("slice_") && listing[i].endsWith(".source"))
         slices.add(new PackedSlice(grammar_dir + File.separator + listing[i].substring(0, 11)));
     }

     long count = 0;
     for (PackedSlice s : slices)
       count += s.estimated.length;
     root = new PackedRoot(this);

     Decoder.LOG(1, String.format("Loaded %d rules", count));
   }

   @Override
   public Trie getTrieRoot() {
     return root;
   }

   @Override
   public boolean hasRuleForSpan(int startIndex, int endIndex, int pathLength) {
     return (spanLimit == -1 || pathLength <= spanLimit);
   }

   @Override
   public int getNumRules() {
     int num_rules = 0;
     for (PackedSlice ps : slices)
       num_rules += ps.featureSize;
     return num_rules;
   }

   public Rule constructManualRule(int lhs, int[] src, int[] tgt, float[] scores, int arity) {
     return null;
   }

   public final class PackedRoot implements Trie {

     private HashMap<Integer, PackedSlice> lookup;

     public PackedRoot(PackedGrammar grammar) {
       lookup = new HashMap<Integer, PackedSlice>();

       for (PackedSlice ps : grammar.slices) {
         int num_children = ps.source[0];
         for (int i = 0; i < num_children; i++)
           lookup.put(ps.source[2 * i + 1], ps);
       }
     }

     @Override
     public Trie match(int word_id) {
       PackedSlice ps = lookup.get(word_id);
       if (ps != null)
         return ps.root().match(word_id);
       return null;
     }

     @Override
     public boolean hasExtensions() {
       return !lookup.isEmpty();
     }

     @Override
     public HashMap<Integer, ? extends Trie> getChildren() {
       HashMap<Integer, Trie> children = new HashMap<Integer, Trie>();
       for (int key : lookup.keySet())
         children.put(key, match(key));
       return children;
     }

     @Override
     public ArrayList<? extends Trie> getExtensions() {
       ArrayList<Trie> tries = new ArrayList<Trie>();
       for (int key : lookup.keySet()) {
         tries.add(match(key));
       }
       return tries;
     }

     @Override
     public boolean hasRules() {
       return false;
     }

     @Override
     public RuleCollection getRuleCollection() {
       return new BasicRuleCollection(0, new int[0]);
     }

     @Override
     public Iterator<Integer> getTerminalExtensionIterator() {
       return new ExtensionIterator(lookup, true);
     }

     @Override
     public Iterator<Integer> getNonterminalExtensionIterator() {
       return new ExtensionIterator(lookup, false);
     }
   }

   public final class PackedSlice {
     private final String name;

     private final int[] source;

     private final int[] target;
     private final int[] targetLookup;

     private MappedByteBuffer features;
     private int featureSize;
     private int[] featureLookup;
     private RandomAccessFile featureFile;

     private float[] estimated;
     private float[] precomputable;

     private RandomAccessFile alignmentFile;
     private MappedByteBuffer alignments;
     private int[] alignmentLookup;

     private HashMap<Integer, PackedTrie> tries;

     public PackedSlice(String prefix) throws IOException {
       name = prefix;

       File source_file = new File(prefix + ".source");
       File target_file = new File(prefix + ".target");
       File target_lookup_file = new File(prefix + ".target.lookup");
       File feature_file = new File(prefix + ".features");
       File alignment_file = new File(prefix + ".alignments");

       // Get the channels etc.
       FileInputStream source_fis = new FileInputStream(source_file);
       FileChannel source_channel = source_fis.getChannel();
       int source_size = (int) source_channel.size();

       FileInputStream target_fis = new FileInputStream(target_file);
       FileChannel target_channel = target_fis.getChannel();
       int target_size = (int) target_channel.size();

       featureFile = new RandomAccessFile(feature_file, "r");
       FileChannel feature_channel = featureFile.getChannel();
       int feature_size = (int) feature_channel.size();

       IntBuffer source_buffer = source_channel.map(MapMode.READ_ONLY, 0, source_size).asIntBuffer();
       source = new int[source_size / 4];
       source_buffer.get(source);
       source_fis.close();

       IntBuffer target_buffer = target_channel.map(MapMode.READ_ONLY, 0, target_size).asIntBuffer();
       target = new int[target_size / 4];
       target_buffer.get(target);
       target_fis.close();

       features = feature_channel.map(MapMode.READ_ONLY, 0, feature_size);
       features.load();

       if (alignment_file.exists()) {
         alignmentFile = new RandomAccessFile(alignment_file, "r");
         FileChannel alignment_channel = alignmentFile.getChannel();
         int alignment_size = (int) alignment_channel.size();
         alignments = alignment_channel.map(MapMode.READ_ONLY, 0, alignment_size);
         alignments.load();

         int num_blocks = alignments.getInt(0);
         alignmentLookup = new int[num_blocks];
         int header_pos = 8;
         for (int i = 0; i < num_blocks; i++) {
           alignmentLookup[i] = alignments.getInt(header_pos);
           header_pos += 4;
         }
       } else {
         alignments = null;
       }

       int num_blocks = features.getInt(0);
       featureLookup = new int[num_blocks];
       estimated = new float[num_blocks];
       precomputable = new float[num_blocks];
       featureSize = features.getInt(4);
       int header_pos = 8;
       for (int i = 0; i < num_blocks; i++) {
         featureLookup[i] = features.getInt(header_pos);
         estimated[i] = Float.NEGATIVE_INFINITY;
         precomputable[i] = Float.NEGATIVE_INFINITY;
         header_pos += 4;
       }

       DataInputStream target_lookup_stream = new DataInputStream(new BufferedInputStream(
           new FileInputStream(target_lookup_file)));
       targetLookup = new int[target_lookup_stream.readInt()];
       for (int i = 0; i < targetLookup.length; i++)
         targetLookup[i] = target_lookup_stream.readInt();
       target_lookup_stream.close();

       tries = new HashMap<Integer, PackedTrie>();
     }

     @SuppressWarnings("unused")
     private final Object guardian = new Object() {
       @Override
       // Finalizer object to ensure feature file handle get closed upon slice's dismissal.
       protected void finalize() throws Throwable {
         featureFile.close();
       }
     };

     private final int[] getTarget(int pointer) {
       // Figure out level.
       int tgt_length = 1;
       while (tgt_length < (targetLookup.length + 1) && targetLookup[tgt_length] <= pointer)
         tgt_length++;
       int[] tgt = new int[tgt_length];
       int index = 0;
       int parent;
       do {
         parent = target[pointer];
         if (parent != -1)
           tgt[index++] = target[pointer + 1];
         pointer = parent;
       } while (pointer != -1);
       return tgt;
     }

     private synchronized PackedTrie getTrie(final int node_address) {
       PackedTrie t = tries.get(node_address);
       if (t == null) {
         t = new PackedTrie(node_address);
         tries.put(node_address, t);
       }
       return t;
     }

     private synchronized PackedTrie getTrie(int node_address, int[] parent_src, int parent_arity,
         int symbol) {
       PackedTrie t = tries.get(node_address);
       if (t == null) {
         t = new PackedTrie(node_address, parent_src, parent_arity, symbol);
         tries.put(node_address, t);
       }
       return t;
     }

     /**
      * NEW VERSION
      *
      * Returns a string version of the features associated with a rule (represented as a block ID).
      * These features are in the form "feature1=value feature2=value...". By default, unlabeled
      * features are named using the pattern
      *
      * tm_OWNER_INDEX
      *
      * where OWNER is the grammar's owner (Vocabulary.word(this.owner)) and INDEX is a 0-based index
      * of the feature found in the grammar.
      *
      * @param block_id
      * @return
      */

     private final String getFeatures(int block_id) {
       int feature_position = featureLookup[block_id];

       // The number of non-zero features stored with the rule.
       int num_features = encoding.readId(features, feature_position);

       feature_position += EncoderConfiguration.ID_SIZE;
       StringBuilder sb = new StringBuilder();
       for (int i = 0; i < num_features; i++) {
         int feature_id = encoding.readId(features, feature_position);
         FloatEncoder encoder = encoding.encoder(feature_id);

         String feature_name = Vocabulary.word(encoding.outerId(feature_id));
         try {
           int index = Integer.parseInt(feature_name);
           sb.append(String.format(" tm_%s_%d=%.5f", Vocabulary.word(owner), index,
               -encoder.read(features, feature_position)));
         } catch (NumberFormatException e) {
           sb.append(String.format(" %s=%.5f", feature_name, encoder.read(features, feature_position)));
         }

         feature_position += EncoderConfiguration.ID_SIZE + encoder.size();
       }
       return sb.toString().trim();
     }

     private final byte[] getAlignmentArray(int block_id) {
       if (alignments == null)
         throw new RuntimeException("No alignments available.");
       int alignment_position = alignmentLookup[block_id];
       int num_points = (int) alignments.get(alignment_position);
       byte[] alignment = new byte[num_points * 2];

       alignments.position(alignment_position + 1);
       alignments.get(alignment, 0, num_points * 2);
       return alignment;
     }

     private final PackedTrie root() {
       return getTrie(0);
     }

     public String toString() {
       return name;
     }

     /**
      * A trie node within the grammar slice. Identified by its position within the source array,
      * and, as a supplement, the source string leading from the trie root to the node.
      *
      * @author jg
      *
      */
     public class PackedTrie implements Trie, RuleCollection {

       private final int position;

       private boolean sorted = false;

       private int[] src;
       private int arity;

       private PackedTrie(int position) {
         this.position = position;
         src = new int[0];
         arity = 0;
       }

       private PackedTrie(int position, int[] parent_src, int parent_arity, int symbol) {
         this.position = position;
         src = new int[parent_src.length + 1];
         System.arraycopy(parent_src, 0, src, 0, parent_src.length);
         src[src.length - 1] = symbol;
         arity = parent_arity;
         if (Vocabulary.nt(symbol))
           arity++;
       }

       @Override
       public final Trie match(int token_id) {
         int num_children = source[position];
         if (num_children == 0)
           return null;
         if (num_children == 1 && token_id == source[position + 1])
           return getTrie(source[position + 2], src, arity, token_id);
         int top = 0;
         int bottom = num_children - 1;
         while (true) {
           int candidate = (top + bottom) / 2;
           int candidate_position = position + 1 + 2 * candidate;
           int read_token = source[candidate_position];
           if (read_token == token_id) {
             return getTrie(source[candidate_position + 1], src, arity, token_id);
           } else if (top == bottom) {
             return null;
           } else if (read_token > token_id) {
             top = candidate + 1;
           } else {
             bottom = candidate - 1;
           }
           if (bottom < top)
             return null;
         }
       }

       @Override
       public HashMap<Integer, ? extends Trie> getChildren() {
         HashMap<Integer, Trie> children = new HashMap<Integer, Trie>();
         int num_children = source[position];
         for (int i = 0; i < num_children; i++) {
           int symbol = source[position + 1 + 2 * i];
           int address = source[position + 2 + 2 * i];
           children.put(symbol, getTrie(address, src, arity, symbol));
         }
         return children;
       }

       @Override
       public boolean hasExtensions() {
         return (source[position] != 0);
       }

       @Override
       public ArrayList<? extends Trie> getExtensions() {
         int num_children = source[position];
         ArrayList<PackedTrie> tries = new ArrayList<PackedTrie>(num_children);

         for (int i = 0; i < num_children; i++) {
           int symbol = source[position + 1 + 2 * i];
           int address = source[position + 2 + 2 * i];
           tries.add(getTrie(address, src, arity, symbol));
         }

         return tries;
       }

       @Override
       public boolean hasRules() {
         int num_children = source[position];
         return (source[position + 1 + 2 * num_children] != 0);
       }

       @Override
       public RuleCollection getRuleCollection() {
         return this;
       }

       @Override
       public List<Rule> getRules() {
         int num_children = source[position];
         int rule_position = position + 2 * (num_children + 1);
         int num_rules = source[rule_position - 1];

         ArrayList<Rule> rules = new ArrayList<Rule>(num_rules);
         for (int i = 0; i < num_rules; i++) {
           if (type.equals("moses"))
             rules.add(new PackedPhrasePair(rule_position + 3 * i));
           else
             rules.add(new PackedRule(rule_position + 3 * i));
         }
         return rules;
       }

       /**
        * We determine if the Trie is sorted by checking if the estimated cost of the first rule in
        * the trie has been set.
        */
       @Override
       public boolean isSorted() {
         return sorted;
       }

       private synchronized void sortRules(List<FeatureFunction> models) {
         int num_children = source[position];
         int rule_position = position + 2 * (num_children + 1);
         int num_rules = source[rule_position - 1];
         if (num_rules == 0) {
           this.sorted = true;
           return;
         }
         Integer[] rules = new Integer[num_rules];

         int target_address;
         int block_id;
         for (int i = 0; i < num_rules; ++i) {
           target_address = source[rule_position + 1 + 3 * i];
           rules[i] = rule_position + 2 + 3 * i;
           block_id = source[rules[i]];

           Rule rule = new Rule(source[rule_position + 3 * i], src,
               getTarget(target_address), getFeatures(block_id), arity, owner);
           estimated[block_id] = rule.estimateRuleCost(models);
           precomputable[block_id] = rule.getPrecomputableCost();
         }

         Arrays.sort(rules, new Comparator<Integer>() {
           public int compare(Integer a, Integer b) {
             float a_cost = estimated[source[a]];
             float b_cost = estimated[source[b]];
             if (a_cost == b_cost)
               return 0;
             return (a_cost > b_cost ? -1 : 1);
           }
         });

         int[] sorted = new int[3 * num_rules];
         int j = 0;
         for (int i = 0; i < rules.length; i++) {
           int address = rules[i];
           sorted[j++] = source[address - 2];
           sorted[j++] = source[address - 1];
           sorted[j++] = source[address];
         }
         for (int i = 0; i < sorted.length; i++)
           source[rule_position + i] = sorted[i];
         this.sorted = true;
       }

       @Override
       public List<Rule> getSortedRules(List<FeatureFunction> featureFunctions) {
         if (!isSorted())
           sortRules(featureFunctions);
         return getRules();
       }

       @Override
       public int[] getSourceSide() {
         return src;
       }

       @Override
       public int getArity() {
         return arity;
       }

       @Override
       public Iterator<Integer> getTerminalExtensionIterator() {
         return new PackedChildIterator(position, true);
       }

       @Override
       public Iterator<Integer> getNonterminalExtensionIterator() {
         return new PackedChildIterator(position, false);
       }

       public final class PackedChildIterator implements Iterator<Integer> {

         private int current;
         private boolean terminal;
         private boolean done;
         private int last;

         PackedChildIterator(int position, boolean terminal) {
           this.terminal = terminal;
           int num_children = source[position];
           done = (num_children == 0);
           if (!done) {
             current = (terminal ? position + 1 : position - 1 + 2 * num_children);
             last = (terminal ? position - 1 + 2 * num_children : position + 1);
           }
         }

         @Override
         public boolean hasNext() {
           if (done)
             return false;
           int next = (terminal ? current + 2 : current - 2);
           if (next == last)
             return false;
           return (terminal ? source[next] > 0 : source[next] < 0);
         }

         @Override
         public Integer next() {
           if (done)
             throw new RuntimeException("No more symbols!");
           int symbol = source[current];
           if (current == last)
             done = true;
           if (!done) {
             current = (terminal ? current + 2 : current - 2);
             done = (terminal ? source[current] < 0 : source[current] > 0);
           }
           return symbol;
         }

         @Override
         public void remove() {
           throw new UnsupportedOperationException();
         }
       }

       /**
        * A packed phrase pair represents a rule of the form of a phrase pair, packed with the
        * grammar-packer.pl script, which simply adds a nonterminal [X] to the left-hand side of
        * all phrase pairs (and converts the Moses features). The packer then packs these. We have
        * to then put a nonterminal on the source and target sides to treat the phrase pairs like
        * left-branching rules, which is how Joshua deals with phrase decoding.
        *
        * @author Matt Post <post@cs.jhu.edu>
        *
        */
       public final class PackedPhrasePair extends PackedRule {
         public PackedPhrasePair(int address) {
           super(address);
         }

         @Override
         public int getArity() {
           return PackedTrie.this.getArity() + 1;
         }

         /**
          * Take the English phrase of the underlying rule and prepend an [X].
          *
          * @return
          */
         @Override
         public int[] getEnglish() {
           if (tgt == null) {
             int[] phrase = getTarget(source[address + 1]);
             tgt = new int[phrase.length + 1];
             tgt[0] = -1;
             for (int i = 0; i < phrase.length; i++)
               tgt[i+1] = phrase[i];
           }
           return tgt;
         }


         /**
          * Take the French phrase of the underlying rule and prepend an [X].
          *
          * @return
          */
         @Override
         public int[] getFrench() {
           int phrase[] = new int[src.length + 1];
           int ntid = Vocabulary.id(PackedGrammar.this.joshuaConfiguration.default_non_terminal);
           phrase[0] = ntid;
           System.arraycopy(src,  0, phrase, 1, src.length);
           return phrase;
         }

         /**
          * Similarly the alignment array needs to be shifted over by one.
          *
          * @return
          */
         @Override
         public byte[] getAlignment() {
           // alignments is the underlying raw alignment data
           if (alignments != null) {
             byte[] a = getAlignmentArray(source[address + 2]);
             byte[] points = new byte[a.length + 2];
             points[0] = points[1] = 0;
             for (int i = 0; i < a.length; i++)
               points[i + 2] = (byte) (a[i] + 1);
             return points;
           }
           return null;
         }
       }

       public class PackedRule extends Rule {
         protected final int address;

         protected int[] tgt = null;
         private FeatureVector features = null;

         public PackedRule(int address) {
           this.address = address;
         }

         @Override
         public void setArity(int arity) {
         }

         @Override
         public int getArity() {
           return PackedTrie.this.getArity();
         }

         @Override
         public void setOwner(int ow) {
         }

         @Override
         public int getOwner() {
           return owner;
         }

         @Override
         public void setLHS(int lhs) {
         }

         @Override
         public int getLHS() {
           return source[address];
         }

         @Override
         public void setEnglish(int[] eng) {
         }

         @Override
         public int[] getEnglish() {
           if (tgt == null) {
             tgt = getTarget(source[address + 1]);
           }
           return tgt;
         }

         @Override
         public void setFrench(int[] french) {
         }

         @Override
         public int[] getFrench() {
           return src;
         }

         @Override
         public FeatureVector getFeatureVector() {
           if (features == null) {
             features = new FeatureVector(getFeatures(source[address + 2]), "");
           }

           return features;
         }

         @Override
         public byte[] getAlignment() {
           if (alignments != null)
             return getAlignmentArray(source[address + 2]);
           return null;
         }

         @Override
         public float getEstimatedCost() {
           return estimated[source[address + 2]];
         }

 //        @Override
 //        public void setPrecomputableCost(float cost) {
 //          precomputable[source[address + 2]] = cost;
 //        }

         @Override
         public float getPrecomputableCost() {
           return precomputable[source[address + 2]];
         }

         @Override
         public float estimateRuleCost(List<FeatureFunction> models) {
           return estimated[source[address + 2]];
         }

         @Override
         public String toString() {
           StringBuffer sb = new StringBuffer();
           sb.append(Vocabulary.word(this.getLHS()));
           sb.append(" ||| ");
           sb.append(getFrenchWords());
           sb.append(" ||| ");
           sb.append(getEnglishWords());
           sb.append(" |||");
           sb.append(" " + getFeatureVector());
           sb.append(String.format(" ||| %.3f", getEstimatedCost()));
           return sb.toString();
         }
       }
     }
   }

   @Override
   public boolean isRegexpGrammar() {
     // TODO Auto-generated method stub
     return false;
   }

   @Override
   public void addOOVRules(int word, List<FeatureFunction> featureFunctions) {
     throw new RuntimeException("PackedGrammar: I can't add OOV rules");
   }

   private void readConfig(String config) throws IOException {
     for (String line: new LineReader(config)) {
       String[] tokens = line.split(" = ");
       if (tokens[0].equals("max-source-len"))
         this.maxSourcePhraseLength = Integer.parseInt(tokens[1]);
     }
   }
 }