| package joshua.decoder.chart_parser; |
| |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.HashMap; |
| import java.util.List; |
| import java.util.logging.Level; |
| import java.util.logging.Logger; |
| import joshua.corpus.Vocabulary; |
| import joshua.decoder.ff.tm.Grammar; |
| import joshua.decoder.ff.tm.Rule; |
| import joshua.decoder.ff.tm.RuleCollection; |
| import joshua.decoder.ff.tm.Trie; |
| import joshua.lattice.Arc; |
| import joshua.lattice.Lattice; |
| import joshua.lattice.Node; |
| import joshua.util.ChartSpan; |
| |
| /** |
| * The DotChart handles Earley-style implicit binarization of translation rules. |
| * |
| * The {@link DotNode} object represents the (possibly partial) application of a synchronous rule. |
| * The implicit binarization is maintained with a pointer to the {@link Trie} node in the grammar, |
| * for easy retrieval of the next symbol to be matched. At every span (i,j) of the input sentence, |
| * every incomplete DotNode is examined to see whether it (a) needs a terminal and matches against |
| * the final terminal of the span or (b) needs a nonterminal and matches against a completed |
| * nonterminal in the main chart at some split point (k,j). |
| * |
| * Once a rule is completed, it is entered into the {@link DotChart}. {@link DotCell} objects are |
| * used to group completed DotNodes over a span. |
| * |
| * There is a separate DotChart for every grammar. |
| * |
| * @author Zhifei Li, <zhifei.work@gmail.com> |
| * @author Matt Post <post@cs.jhu.edu> |
| * @author Kristy Hollingshead Seitz |
| */ |
| class DotChart { |
| |
| // =============================================================== |
| // Package-protected instance fields |
| // =============================================================== |
| /** |
| * Two-dimensional chart of cells. Some cells might be null. This could definitely be represented |
| * more efficiently, since only the upper half of this triangle is every used. |
| */ |
| private ChartSpan<DotCell> dotcells; |
| |
| public DotCell getDotCell(int i, int j) { |
| return dotcells.get(i, j); |
| } |
| |
| // =============================================================== |
| // Private instance fields (maybe could be protected instead) |
| // =============================================================== |
| |
| /** |
| * CKY+ style parse chart in which completed span entries are stored. |
| */ |
| private Chart dotChart; |
| |
| /** |
| * Translation grammar which contains the translation rules. |
| */ |
| private Grammar pGrammar; |
| |
| /* Length of input sentence. */ |
| private final int sentLen; |
| |
| /* Represents the input sentence being translated. */ |
| private final Lattice<Integer> input; |
| |
| /* If enabled, rule terminals are treated as regular expressions. */ |
| private final boolean regexpMatching; |
| /* |
| * nonTerminalMatcher determines the behavior of nonterminal matching: strict or soft-syntactic |
| * matching |
| */ |
| private final NonterminalMatcher nonTerminalMatcher; |
| |
| |
| // =============================================================== |
| // Static fields |
| // =============================================================== |
| |
| private static final Logger logger = Logger.getLogger(DotChart.class.getName()); |
| |
| // =============================================================== |
| // Constructors |
| // =============================================================== |
| |
| // TODO: Maybe this should be a non-static inner class of Chart. That would give us implicit |
| // access to all the arguments of this constructor. Though we would need to take an argument, i, |
| // to know which Chart.this.grammars[i] to use. |
| |
| /** |
| * Constructs a new dot chart from a specified input lattice, a translation grammar, and a parse |
| * chart. |
| * |
| * @param input A lattice which represents an input sentence. |
| * @param grammar A translation grammar. |
| * @param chart A CKY+ style chart in which completed span entries are stored. |
| */ |
| |
| |
| |
| public DotChart(Lattice<Integer> input, Grammar grammar, Chart chart, |
| NonterminalMatcher nonTerminalMatcher, boolean regExpMatching) { |
| |
| this.dotChart = chart; |
| this.pGrammar = grammar; |
| this.input = input; |
| this.sentLen = input.size(); |
| |
| this.dotcells = new ChartSpan<DotCell>(sentLen, null); |
| this.nonTerminalMatcher = nonTerminalMatcher; |
| this.regexpMatching = regExpMatching; |
| |
| // seeding the dotChart |
| seed(); |
| } |
| |
| |
| /* |
| * public DotChart(Lattice<Integer> input, Grammar grammar, Chart chart, boolean |
| * useRegexpMatching) { this.dotChart = chart; this.pGrammar = grammar; this.input = input; |
| * this.sentLen = input.size(); this.dotcells = new ChartSpan<DotCell>(sentLen, null); |
| * |
| * // seeding the dotChart seed(); |
| * |
| * this.regexpMatching = useRegexpMatching; } |
| */ |
| |
| // =============================================================== |
| // Package-protected methods |
| // =============================================================== |
| |
| /** |
| * Add initial dot items: dot-items pointer to the root of the grammar trie. |
| */ |
| void seed() { |
| for (int j = 0; j <= sentLen - 1; j++) { |
| if (pGrammar.hasRuleForSpan(j, j, input.distance(j, j))) { |
| if (null == pGrammar.getTrieRoot()) { |
| throw new RuntimeException("trie root is null"); |
| } |
| addDotItem(pGrammar.getTrieRoot(), j, j, null, null, new SourcePath()); |
| } |
| } |
| } |
| |
| /** |
| * This function computes all possible expansions of all rules over the provided span (i,j). By |
| * expansions, we mean the moving of the dot forward (from left to right) over a nonterminal or |
| * terminal symbol on the rule's source side. |
| * |
| * There are two kinds of expansions: |
| * |
| * <ol> |
| * <li>Expansion over a nonterminal symbol. For this kind of expansion, a rule has a dot |
| * immediately prior to a source-side nonterminal. The main Chart is consulted to see whether |
| * there exists a completed nonterminal with the same label. If so, the dot is advanced. |
| * |
| * Discovering nonterminal expansions is a matter of enumerating all split points k such that i < |
| * k and k < j. The nonterminal symbol must exist in the main Chart over (k,j). |
| * |
| * <li>Expansion over a terminal symbol. In this case, expansion is a simple matter of determing |
| * whether the input symbol at position j (the end of the span) matches the next symbol in the |
| * rule. This is equivalent to choosing a split point k = j - 1 and looking for terminal symbols |
| * over (k,j). Note that phrases in the input rule are handled one-by-one as we consider longer |
| * spans. |
| * </ol> |
| */ |
| void expandDotCell(int i, int j) { |
| if (logger.isLoggable(Level.FINEST)) |
| logger.finest("Expanding dot cell (" + i + "," + j + ")"); |
| |
| /* |
| * (1) If the dot is just to the left of a non-terminal variable, we look for theorems or axioms |
| * in the Chart that may apply and extend the dot position. We look for existing axioms over all |
| * spans (k,j), i < k < j. |
| */ |
| for (int k = i + 1; k < j; k++) { |
| extendDotItemsWithProvedItems(i, k, j, false); |
| } |
| |
| /* |
| * (2) If the the dot-item is looking for a source-side terminal symbol, we simply match against |
| * the input sentence and advance the dot. |
| */ |
| Node<Integer> node = input.getNode(j - 1); |
| for (Arc<Integer> arc : node.getOutgoingArcs()) { |
| |
| int last_word = arc.getLabel(); |
| int arc_len = arc.getHead().getNumber() - arc.getTail().getNumber(); |
| |
| // int last_word=foreign_sent[j-1]; // input.getNode(j-1).getNumber(); // |
| |
| if (null != dotcells.get(i, j - 1)) { |
| // dotitem in dot_bins[i][k]: looking for an item in the right to the dot |
| |
| |
| for (DotNode dotNode : dotcells.get(i, j - 1).getDotNodes()) { |
| |
| // String arcWord = Vocabulary.word(last_word); |
| // Assert.assertFalse(arcWord.endsWith("]")); |
| // Assert.assertFalse(arcWord.startsWith("[")); |
| // logger.info("DotChart.expandDotCell: " + arcWord); |
| |
| // List<Trie> child_tnodes = ruleMatcher.produceMatchingChildTNodesTerminalevel(dotNode, |
| // last_word); |
| |
| List<Trie> child_tnodes = null; |
| |
| if (this.regexpMatching) { |
| child_tnodes = matchAll(dotNode, last_word); |
| } else { |
| Trie child_node = dotNode.trieNode.match(last_word); |
| child_tnodes = Arrays.asList(child_node); |
| } |
| |
| if (!(child_tnodes == null || child_tnodes.isEmpty())) { |
| for (Trie child_tnode : child_tnodes) { |
| if (null != child_tnode) { |
| addDotItem(child_tnode, i, j - 1 + arc_len, dotNode.antSuperNodes, null, |
| dotNode.srcPath.extend(arc)); |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| /** |
| * note: (i,j) is a non-terminal, this cannot be a cn-side terminal, which have been handled in |
| * case2 of dotchart.expand_cell add dotitems that start with the complete super-items in |
| * cell(i,j) |
| */ |
| void startDotItems(int i, int j) { |
| extendDotItemsWithProvedItems(i, i, j, true); |
| } |
| |
| // =============================================================== |
| // Private methods |
| // =============================================================== |
| |
| /** |
| * Attempt to combine an item in the dot chart with an item in the main chart to create a new item |
| * in the dot chart. The DotChart item is a {@link DotNode} begun at position i with the dot |
| * currently at position k, that is, a partially-applied rule. |
| * |
| * In other words, this method looks for (proved) theorems or axioms in the completed chart that |
| * may apply and extend the dot position. |
| * |
| * @param i Start index of a dot chart item |
| * @param k End index of a dot chart item; start index of a completed chart item |
| * @param j End index of a completed chart item |
| * @param skipUnary if true, don't extend unary rules |
| */ |
| private void extendDotItemsWithProvedItems(int i, int k, int j, boolean skipUnary) { |
| if (this.dotcells.get(i, k) == null || this.dotChart.getCell(k, j) == null) { |
| return; |
| } |
| |
| // complete super-items (items over the same span with different LHSs) |
| List<SuperNode> superNodes = new ArrayList<SuperNode>(this.dotChart.getCell(k, j) |
| .getSortedSuperItems().values()); |
| |
| /* For every partially complete item over (i,k) */ |
| for (DotNode dotNode : dotcells.get(i, k).dotNodes) { |
| /* For every completed nonterminal in the main chart */ |
| for (SuperNode superNode : superNodes) { |
| |
| // String arcWord = Vocabulary.word(superNode.lhs); |
| // logger.info("DotChart.extendDotItemsWithProvedItems: " + arcWord); |
| // Assert.assertTrue(arcWord.endsWith("]")); |
| // Assert.assertTrue(arcWord.startsWith("[")); |
| |
| /* |
| * Regular Expression matching allows for a regular-expression style rules in the grammar, |
| * which allows for a very primitive treatment of morphology. This is an advanced, |
| * undocumented feature that introduces a complexity, in that the next "word" in the grammar |
| * rule might match more than one outgoing arc in the grammar trie. |
| */ |
| List<Trie> child_tnodes = nonTerminalMatcher.produceMatchingChildTNodesNonterminalLevel( |
| dotNode, superNode); |
| |
| if (!child_tnodes.isEmpty()) { |
| for (Trie child_tnode : child_tnodes) { |
| if (child_tnode != null) { |
| if ((!skipUnary) || (child_tnode.hasExtensions())) { |
| addDotItem(child_tnode, i, j, dotNode.getAntSuperNodes(), superNode, dotNode |
| .getSourcePath().extendNonTerminal()); |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| /* |
| * We introduced the ability to have regular expressions in rules for matching against terminals. |
| * For example, you could have the rule |
| * |
| * <pre> [X] ||| l?s herman?s ||| siblings </pre> |
| * |
| * When this is enabled for a grammar, we need to test against *all* (positive) outgoing arcs of |
| * the grammar trie node to see if any of them match, and then return the whole set. This is quite |
| * expensive, which is why you should only enable regular expressions for small grammars. |
| */ |
| |
| private ArrayList<Trie> matchAll(DotNode dotNode, int wordID) { |
| ArrayList<Trie> trieList = new ArrayList<Trie>(); |
| HashMap<Integer, ? extends Trie> childrenTbl = dotNode.trieNode.getChildren(); |
| |
| if (childrenTbl != null && wordID >= 0) { |
| // get all the extensions, map to string, check for *, build regexp |
| for (Integer arcID : childrenTbl.keySet()) { |
| if (arcID == wordID) { |
| trieList.add(childrenTbl.get(arcID)); |
| } else { |
| String arcWord = Vocabulary.word(arcID); |
| if (Vocabulary.word(wordID).matches(arcWord)) { |
| trieList.add(childrenTbl.get(arcID)); |
| } |
| } |
| } |
| } |
| return trieList; |
| } |
| |
| |
| /** |
| * Creates a {@link DotNode} and adds it into the {@link DotChart} at the correct place. These |
| * are (possibly incomplete) rule applications. |
| * |
| * @param tnode the trie node pointing to the location ("dot") in the grammar trie |
| * @param i |
| * @param j |
| * @param antSuperNodesIn the supernodes representing the rule's tail nodes |
| * @param curSuperNode the lefthand side of the rule being created |
| * @param srcPath the path taken through the input lattice |
| */ |
| private void addDotItem(Trie tnode, int i, int j, List<SuperNode> antSuperNodesIn, |
| SuperNode curSuperNode, SourcePath srcPath) { |
| List<SuperNode> antSuperNodes = new ArrayList<SuperNode>(); |
| if (antSuperNodesIn != null) { |
| antSuperNodes.addAll(antSuperNodesIn); |
| } |
| if (curSuperNode != null) { |
| antSuperNodes.add(curSuperNode); |
| } |
| |
| DotNode item = new DotNode(i, j, tnode, antSuperNodes, srcPath); |
| if (dotcells.get(i, j) == null) { |
| dotcells.set(i, j, new DotCell()); |
| } |
| dotcells.get(i, j).addDotNode(item); |
| dotChart.nDotitemAdded++; |
| |
| if (logger.isLoggable(Level.FINEST)) { |
| logger.finest(String.format("Add a dotitem in cell (%d, %d), n_dotitem=%d, %s", i, j, |
| dotChart.nDotitemAdded, srcPath)); |
| |
| RuleCollection rules = tnode.getRuleCollection(); |
| if (rules != null) { |
| for (Rule r : rules.getRules()) { |
| // System.out.println("rule: "+r.toString()); |
| logger.finest(r.toString()); |
| } |
| } |
| } |
| } |
| |
| // =============================================================== |
| // Package-protected classes |
| // =============================================================== |
| |
| /** |
| * A DotCell groups together DotNodes that have been applied over a particular span. A DotNode, in |
| * turn, is a partially-applied grammar rule, represented as a pointer into the grammar trie |
| * structure. |
| */ |
| static class DotCell { |
| |
| // Package-protected fields |
| private List<DotNode> dotNodes = new ArrayList<DotNode>(); |
| |
| public List<DotNode> getDotNodes() { |
| return dotNodes; |
| } |
| |
| private void addDotNode(DotNode dt) { |
| /* |
| * if(l_dot_items==null) l_dot_items= new ArrayList<DotItem>(); |
| */ |
| dotNodes.add(dt); |
| } |
| } |
| |
| /** |
| * A DotNode represents the partial application of a rule rooted to a particular span (i,j). It |
| * maintains a pointer to the trie node in the grammar for efficient mapping. |
| */ |
| static class DotNode { |
| |
| // ======================================================= |
| // Package-protected instance fields |
| // ======================================================= |
| |
| // int i, j; //start and end position in the chart |
| private Trie trieNode = null; // dot_position, point to grammar trie node, this is the only |
| // place that the DotChart points to the grammar |
| private List<SuperNode> antSuperNodes = null; // pointer to SuperNode in Chart |
| private SourcePath srcPath; |
| |
| public DotNode(int i, int j, Trie trieNode, List<SuperNode> antSuperNodes, SourcePath srcPath) { |
| // i = i_in; |
| // j = j_in; |
| this.trieNode = trieNode; |
| this.antSuperNodes = antSuperNodes; |
| this.srcPath = srcPath; |
| } |
| |
| public boolean equals(Object obj) { |
| if (obj == null) |
| return false; |
| if (!this.getClass().equals(obj.getClass())) |
| return false; |
| DotNode state = (DotNode) obj; |
| |
| /* |
| * Technically, we should be comparing the span inforamtion as well, but that would require us |
| * to store it, increasing memory requirements, and we should be able to guarantee that we |
| * won't be comparing DotNodes across spans. |
| */ |
| // if (this.i != state.i || this.j != state.j) |
| // return false; |
| |
| if (this.trieNode != state.trieNode) |
| return false; |
| |
| return true; |
| } |
| |
| /** |
| * Technically the hash should include the span (i,j), but since DotNodes are grouped by span, |
| * this isn't necessary, and we gain something by not having to store the span. |
| */ |
| public int hashCode() { |
| return this.trieNode.hashCode(); |
| } |
| |
| // convenience function |
| public RuleCollection getApplicableRules() { |
| return getTrieNode().getRuleCollection(); |
| } |
| |
| public Trie getTrieNode() { |
| return trieNode; |
| } |
| |
| public SourcePath getSourcePath() { |
| return srcPath; |
| } |
| |
| public List<SuperNode> getAntSuperNodes() { |
| return antSuperNodes; |
| } |
| } |
| |
| } |