| package joshua.decoder.hypergraph; |
| |
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.PriorityQueue;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import joshua.corpus.Vocabulary;
import joshua.decoder.BLEU;
import joshua.decoder.JoshuaConfiguration;
import joshua.decoder.chart_parser.ComputeNodeResult;
import joshua.decoder.ff.FeatureFunction;
import joshua.decoder.ff.FeatureVector;
import joshua.decoder.ff.fragmentlm.Tree;
import joshua.decoder.ff.state_maintenance.DPState;
import joshua.decoder.ff.tm.Rule;
import joshua.decoder.io.DeNormalize;
import joshua.decoder.segment_file.Sentence;
| |
| /** |
| * This class implements lazy k-best extraction on a hyper-graph. |
| * |
| * K-best extraction over hypergraphs is a little hairy, but is best understood in the following |
| * manner. Imagine a hypergraph, which is composed of nodes connected by hyperedges. A hyperedge has |
| * exactly one parent node and 1 or more tail nodes, corresponding to the rank of the rule that gave |
| * rise to the hyperedge. Each node has 1 or more incoming hyperedges. |
| * |
| * K-best extraction works in the following manner. A derivation is a set of nodes and hyperedges |
| * that leads from the root node down and exactly covers the source-side sentence. To define a |
| * derivation, we start at the root node, choose one of its incoming hyperedges, and then recurse to |
| * the tail (or antecedent) nodes of that hyperedge, where we continually make the same decision. |
| * |
| * Each hypernode has its hyperedges sorted according to their model score. To get the best |
| * (Viterbi) derivation, we simply recursively follow the best hyperedge coming in to each |
| * hypernode. |
| * |
| * How do we get the second-best derivation? It is defined by changing exactly one of the decisions |
| * about which hyperedge to follow in the recursion. Somewhere, we take the second-best. Similarly, |
| * the third-best derivation makes a single change from the second-best: either making another |
| * (differnt) second-best choice somewhere along the 1-best derivation, or taking the third-best |
| * choice at the same spot where the second-best derivation took the second-best choice. And so on. |
| * |
| * This class uses two classes that encode the necessary meta-information. The first is the |
| * DerivationState class. It roughly corresponds to a hyperedge, and records, for each of that |
| * hyperedge's tail nodes, which-best to take. So for a hyperedge with three tail nodes, the 1-best |
| * derivation will be (1,1,1), the second-best will be one of (2,1,1), (1,2,1), or (1,1,2), the |
| * third best will be one of |
| * |
| * (3,1,1), (2,2,1), (1,1,3) |
| * |
| * and so on. |
| * |
| * The configuration parameter `output-format` controls what exactly is extracted from the forest. |
| * See documentation for that below. Note that Joshua does not store individual feature values while |
| * decoding, but only the cost of each edge (in the form of a float). Therefore, if you request |
| * the features values (`%f` in `output-format`), the feature functions must be replayed, which |
| * is expensive. |
| * |
| * The configuration parameter `top-n` controls how many items are returned. If this is set to 0, |
| * k-best extraction should be turned off entirely. |
| * |
| * @author Zhifei Li, <zhifei.work@gmail.com> |
| * @author Matt Post <post@cs.jhu.edu> |
| */ |
| public class KBestExtractor { |
| private final JoshuaConfiguration joshuaConfiguration; |
| private final HashMap<HGNode, VirtualNode> virtualNodesTable = new HashMap<HGNode, VirtualNode>(); |
| |
| // static final String rootSym = JoshuaConfiguration.goal_symbol; |
| static final String rootSym = "ROOT"; |
| static final int rootID = Vocabulary.id(rootSym); |
| |
| private enum Side { |
| SOURCE, TARGET |
| }; |
| |
| /* Whether to extract only unique strings */ |
| private boolean extractUniqueNbest = true; |
| |
| /* Whether to include the alignment information in the output */ |
| @SuppressWarnings("unused") |
| private boolean includeAlign = false; |
| |
| /* Which side to output (source or target) */ |
| private Side defaultSide = Side.TARGET; |
| |
| /* The input sentence */ |
| private Sentence sentence; |
| |
| /* The weights being used to score the forest */ |
| private FeatureVector weights; |
| |
| /* The feature functions */ |
| private List<FeatureFunction> models; |
| |
| /* BLEU statistics of the references */ |
| BLEU.References references = null; |
| |
| public KBestExtractor(Sentence sentence, List<FeatureFunction> models, FeatureVector weights, |
| boolean isMonolingual, JoshuaConfiguration joshuaConfiguration) { |
| |
| this.models = models; |
| |
| this.joshuaConfiguration = joshuaConfiguration; |
| this.extractUniqueNbest = joshuaConfiguration.use_unique_nbest; |
| this.includeAlign = joshuaConfiguration.include_align_index; |
| |
| this.weights = weights; |
| this.defaultSide = (isMonolingual ? Side.SOURCE : Side.TARGET); |
| this.sentence = sentence; |
| |
| if (joshuaConfiguration.rescoreForest) { |
| references = new BLEU.References(sentence.references()); |
| } |
| } |
| |
| /** |
| * Returns the kth derivation. |
| * |
| * You may need to reset_state() before you call this function for the first time. |
| * |
| * @param node the node to start at |
| * @param k the kth best derivation (indexed from 1) |
| * @return the derivation object |
| */ |
| public DerivationState getKthDerivation(HGNode node, int k) { |
| VirtualNode virtualNode = getVirtualNode(node); |
| return virtualNode.lazyKBestExtractOnNode(this, k); |
| } |
| |
| /** |
| * Compute the string that is output from the decoder, using the "output-format" config file |
| * parameter as a template. |
| * |
| * You may need to reset_state() before you call this function for the first time. |
| */ |
| public String getKthHyp(HGNode node, int k) { |
| |
| String outputString = null; |
| |
| // Determine the k-best hypotheses at each HGNode |
| VirtualNode virtualNode = getVirtualNode(node); |
| DerivationState derivationState = virtualNode.lazyKBestExtractOnNode(this, k); |
| // DerivationState derivationState = getKthDerivation(node, k); |
| if (derivationState != null) { |
| // ==== read the kbest from each hgnode and convert to output format |
| FeatureVector features = new FeatureVector(); |
| |
| /* |
| * To save space, the decoder only stores the model cost, no the individual feature values. If |
| * you want to output them, you have to replay them. |
| */ |
| String hypothesis = null; |
| if (joshuaConfiguration.outputFormat.contains("%f") |
| || joshuaConfiguration.outputFormat.contains("%d")) |
| features = derivationState.replayFeatures(); |
| |
| hypothesis = derivationState.getHypothesis() |
| .replaceAll("-lsb-", "[") |
| .replaceAll("-rsb-", "]") |
| .replaceAll("-pipe-", "|"); |
| |
| |
| outputString = joshuaConfiguration.outputFormat |
| .replace("%k", Integer.toString(k)) |
| .replace("%s", hypothesis) |
| .replace("%S", DeNormalize.processSingleLine(hypothesis)) |
| .replace("%i", Integer.toString(sentence.id())) |
| .replace("%f", joshuaConfiguration.moses ? features.mosesString() : features.toString()) |
| .replace("%c", String.format("%.3f", derivationState.cost)); |
| |
| if (joshuaConfiguration.outputFormat.contains("%t")) { |
| outputString = outputString.replace("%t", derivationState.getTree()); |
| } |
| |
| if (joshuaConfiguration.outputFormat.contains("%e")) |
| outputString = outputString.replace("%e", derivationState.getHypothesis(Side.SOURCE)); |
| |
| /* %d causes a derivation with rules one per line to be output */ |
| if (joshuaConfiguration.outputFormat.contains("%d")) { |
| outputString = outputString.replace("%d", derivationState.getDerivation()); |
| } |
| } |
| |
| return outputString; |
| } |
| |
| // =========================== end kbestHypergraph |
| |
| /** |
| * Convenience function for k-best extraction that prints to STDOUT. |
| */ |
| public void lazyKBestExtractOnHG(HyperGraph hg, int topN) throws IOException { |
| lazyKBestExtractOnHG(hg, topN, new BufferedWriter(new OutputStreamWriter(System.out))); |
| } |
| |
| /** |
| * This is the entry point for extracting k-best hypotheses. It computes all of them, writing |
| * the results to the BufferedWriter passed in. If you want intermediate access to the k-best |
| * derivations, you'll want to call getKthHyp() or getKthDerivation() directly. |
| * |
| * The number of derivations that are looked for is controlled by the `top-n` parameter. |
| * Note that when `top-n` is set to 0, k-best extraction is disabled entirely, and only things |
| * like the viterbi string and the model score are available to the decoder. Since k-best |
| * extraction involves the recomputation of features to get the component values, turning off |
| * that extraction saves a lot of time when only the 1-best string is desired. |
| * |
| * @param hg the hypergraph to extract from |
| * @param topN how many to extract |
| * @param out object to write to |
| * @throws IOException |
| */ |
| public void lazyKBestExtractOnHG(HyperGraph hg, int topN, BufferedWriter out) throws IOException { |
| |
| resetState(); |
| |
| if (null == hg.goalNode) |
| return; |
| |
| for (int k = 1; k <= topN; k++) { |
| String hypStr = getKthHyp(hg.goalNode, k); |
| if (null == hypStr) |
| break; |
| |
| out.write(hypStr); |
| out.write("\n"); |
| out.flush(); |
| } |
| } |
| |
| /** |
| * This clears the virtualNodesTable, which maintains a list of virtual nodes. This should be |
| * called in between forest rescorings. |
| */ |
| public void resetState() { |
| virtualNodesTable.clear(); |
| } |
| |
| /** |
| * Returns the VirtualNode corresponding to an HGNode. If no such VirtualNode exists, it is |
| * created. |
| * |
| * @param hgnode |
| * @return the corresponding VirtualNode |
| */ |
| private VirtualNode getVirtualNode(HGNode hgnode) { |
| VirtualNode virtualNode = virtualNodesTable.get(hgnode); |
| if (null == virtualNode) { |
| virtualNode = new VirtualNode(hgnode); |
| virtualNodesTable.put(hgnode, virtualNode); |
| } |
| return virtualNode; |
| } |
| |
| |
| /** |
| * This class is essentially a wrapper around an HGNode, annotating it with information needed to |
| * record which hypotheses have been explored from this point. There is one virtual node for |
| * each HGNode in the underlying hypergraph. This VirtualNode maintains information about the |
| * k-best derivations from that point on, retaining the derivations computed so far and a priority |
| * queue of candidates. |
| */ |
| |
| private class VirtualNode { |
| |
| // The node being annotated. |
| HGNode node = null; |
| |
| // sorted ArrayList of DerivationState, in the paper is: D(^) [v] |
| public List<DerivationState> nbests = new ArrayList<DerivationState>(); |
| |
| // remember frontier states, best-first; in the paper, it is called cand[v] |
| private PriorityQueue<DerivationState> candHeap = null; |
| |
| // Remember which DerivationState has been explored (positions in the hypercube). This allows |
| // us to avoid duplicated states that are reached from different places of expansion, e.g., |
| // position (2,2) can be reached be extending (1,2) and (2,1). |
| private HashSet<DerivationState> derivationTable = null; |
| |
| // This records unique *strings* at each item, used for unique-nbest-string extraction. |
| private HashSet<String> uniqueStringsTable = null; |
| |
| public VirtualNode(HGNode it) { |
| this.node = it; |
| } |
| |
| /** |
| * This returns a DerivationState corresponding to the kth-best derivation rooted at this node. |
| * |
| * @param kbestExtractor |
| * @param k (indexed from one) |
| * @return the k-th best (1-indexed) hypothesis, or null if there are no more. |
| */ |
| // return: the k-th hyp or null; k is started from one |
| private DerivationState lazyKBestExtractOnNode(KBestExtractor kbestExtractor, int k) { |
| if (nbests.size() >= k) { // no need to continue |
| return nbests.get(k - 1); |
| } |
| |
| // ### we need to fill in the l_nest in order to get k-th hyp |
| DerivationState derivationState = null; |
| |
| /* |
| * The first time this is called, the heap of candidates (the frontier of the cube) is |
| * uninitialized. This recursive call will seed the candidates at each node. |
| */ |
| if (null == candHeap) { |
| getCandidates(kbestExtractor); |
| } |
| |
| /* |
| * Now build the kbest list by repeatedly popping the best candidate and then placing all |
| * extensions of that hypothesis back on the candidates list. |
| */ |
| int tAdded = 0; // sanity check |
| while (nbests.size() < k) { |
| if (candHeap.size() > 0) { |
| derivationState = candHeap.poll(); |
| // derivation_tbl.remove(res.get_signature());//TODO: should remove? note that two state |
| // may be tied because the cost is the same |
| if (extractUniqueNbest) { |
| // We pass false for extract_nbest_tree because we want; to check that the hypothesis |
| // *strings* are unique, not the trees. |
| String res_str = derivationState.getHypothesis(); |
| if (!uniqueStringsTable.contains(res_str)) { |
| nbests.add(derivationState); |
| uniqueStringsTable.add(res_str); |
| } |
| } else { |
| nbests.add(derivationState); |
| } |
| |
| // Add all extensions of this hypothesis to the candidates list. |
| lazyNext(kbestExtractor, derivationState); |
| |
| // debug: sanity check |
| tAdded++; |
| // this is possible only when extracting unique nbest |
| if (!extractUniqueNbest && tAdded > 1) { |
| throw new RuntimeException("In lazyKBestExtractOnNode, add more than one time, k is " |
| + k); |
| } |
| } else { |
| break; |
| } |
| } |
| if (nbests.size() < k) { |
| derivationState = null;// in case we do not get to the depth of k |
| } |
| // debug: sanity check |
| // if (l_nbest.size() >= k && l_nbest.get(k-1) != res) { |
| // throw new RuntimeException("In lazy_k_best_extract, ranking is not correct "); |
| // } |
| |
| return derivationState; |
| } |
| |
| /** |
| * This function extends the current hypothesis, adding each extended item to the list of |
| * candidates (assuming they have not been added before). It does this by, in turn, extending |
| * each of the tail node items. |
| * |
| * @param kbestExtractor |
| * @param previousState |
| */ |
| private void lazyNext(KBestExtractor kbestExtractor, DerivationState previousState) { |
| /* If there are no tail nodes, there is nothing to do. */ |
| if (null == previousState.edge.getTailNodes()) |
| return; |
| |
| /* For each tail node, create a new state candidate by "sliding" that item one position. */ |
| for (int i = 0; i < previousState.edge.getTailNodes().size(); i++) { |
| /* Create a new virtual node that is a copy of the current node */ |
| HGNode tailNode = (HGNode) previousState.edge.getTailNodes().get(i); |
| VirtualNode virtualTailNode = kbestExtractor.getVirtualNode(tailNode); |
| // Copy over the ranks. |
| int[] newRanks = new int[previousState.ranks.length]; |
| for (int c = 0; c < newRanks.length; c++) { |
| newRanks[c] = previousState.ranks[c]; |
| } |
| // Now increment/slide the current tail node by one |
| newRanks[i] = previousState.ranks[i] + 1; |
| |
| // Create a new state so we can see if it's new. The cost will be set below if it is. |
| DerivationState nextState = new DerivationState(previousState.parentNode, |
| previousState.edge, newRanks, 0.0f, previousState.edgePos); |
| |
| // Don't add the state to the list of candidates if it's already been added. |
| if (!derivationTable.contains(nextState)) { |
| // Make sure that next candidate exists |
| virtualTailNode.lazyKBestExtractOnNode(kbestExtractor, newRanks[i]); |
| // System.err.println(String.format(" newRanks[%d] = %d and tail size %d", i, |
| // newRanks[i], virtualTailNode.nbests.size())); |
| if (newRanks[i] <= virtualTailNode.nbests.size()) { |
| // System.err.println("NODE: " + this.node); |
| // System.err.println(" tail is " + virtualTailNode.node); |
| float cost = previousState.getModelCost() |
| - virtualTailNode.nbests.get(previousState.ranks[i] - 1).getModelCost() |
| + virtualTailNode.nbests.get(newRanks[i] - 1).getModelCost(); |
| nextState.setCost(cost); |
| |
| if (joshuaConfiguration.rescoreForest) |
| nextState.bleu = nextState.computeBLEU(); |
| |
| candHeap.add(nextState); |
| derivationTable.add(nextState); |
| |
| // System.err.println(String.format(" LAZYNEXT(%s", nextState)); |
| } |
| } |
| } |
| } |
| |
| /** |
| * this is the seeding function, for example, it will get down to the leaf, and sort the |
| * terminals get a 1best from each hyperedge, and add them into the heap_cands |
| * |
| * @param kbestExtractor |
| */ |
| private void getCandidates(KBestExtractor kbestExtractor) { |
| /* The list of candidates extending from this (virtual) node. */ |
| candHeap = new PriorityQueue<DerivationState>(); |
| |
| /* |
| * When exploring the cube frontier, there are multiple paths to each candidate. For example, |
| * going down 1 from grid position (2,1) is the same as going right 1 from grid position |
| * (1,2). To avoid adding states more than once, we keep a list of derivation states we have |
| * already added to the candidates heap. |
| * |
| * TODO: these should really be keyed on the states themselves instead of a string |
| * representation of them. |
| */ |
| derivationTable = new HashSet<DerivationState>(); |
| |
| /* |
| * A Joshua configuration option allows the decoder to output only unique strings. In that |
| * case, we keep an list of the frontiers of derivation states extending from this node. |
| */ |
| if (extractUniqueNbest) { |
| uniqueStringsTable = new HashSet<String>(); |
| } |
| |
| /* |
| * Get the single-best derivation along each of the incoming hyperedges, and add the lot of |
| * them to the priority queue of candidates in the form of DerivationState objects. |
| * |
| * Note that since the hyperedges are not sorted according to score, the first derivation |
| * computed here may not be the best. But since the loop over all hyperedges seeds the entire |
| * candidates list with the one-best along each of them, when the candidate heap is polled |
| * afterwards, we are guaranteed to have the best one. |
| */ |
| int pos = 0; |
| for (HyperEdge edge : node.hyperedges) { |
| DerivationState bestState = getBestDerivation(kbestExtractor, node, edge, pos); |
| // why duplicate, e.g., 1 2 + 1 0 == 2 1 + 0 1 , but here we should not get duplicate |
| if (!derivationTable.contains(bestState)) { |
| candHeap.add(bestState); |
| derivationTable.add(bestState); |
| } else { // sanity check |
| throw new RuntimeException( |
| "get duplicate derivation in get_candidates, this should not happen" |
| + "\nsignature is " + bestState + "\nl_hyperedge size is " |
| + node.hyperedges.size()); |
| } |
| pos++; |
| } |
| |
| // TODO: if tem.size is too large, this may cause unnecessary computation, we comment the |
| // segment to accommodate the unique nbest extraction |
| /* |
| * if(tem.size()>global_n){ heap_cands=new PriorityQueue<DerivationState>(); for(int i=1; |
| * i<=global_n; i++) heap_cands.add(tem.poll()); }else heap_cands=tem; |
| */ |
| } |
| |
| // get my best derivation, and recursively add 1best for all my children, used by get_candidates |
| // only |
| /** |
| * This computes the best derivation along a particular hyperedge. It is only called by |
| * getCandidates() to initialize the candidates priority queue at each (virtual) node. |
| * |
| * @param kbestExtractor |
| * @param parentNode |
| * @param hyperEdge |
| * @param edgePos |
| * @return an object representing the best derivation from this node |
| */ |
| private DerivationState getBestDerivation(KBestExtractor kbestExtractor, HGNode parentNode, |
| HyperEdge hyperEdge, int edgePos) { |
| int[] ranks; |
| float cost = 0.0f; |
| |
| /* |
| * There are two cases: (1) leaf nodes and (2) internal nodes. A leaf node is represented by a |
| * hyperedge with no tail nodes. |
| */ |
| if (hyperEdge.getTailNodes() == null) { |
| ranks = null; |
| |
| } else { |
| // "ranks" records which derivation to take at each of the tail nodes. Ranks are 1-indexed. |
| ranks = new int[hyperEdge.getTailNodes().size()]; |
| |
| /* Initialize the one-best at each tail node. */ |
| for (int i = 0; i < hyperEdge.getTailNodes().size(); i++) { // children is ready |
| ranks[i] = 1; |
| VirtualNode childVirtualNode = kbestExtractor.getVirtualNode(hyperEdge.getTailNodes() |
| .get(i)); |
| // recurse |
| childVirtualNode.lazyKBestExtractOnNode(kbestExtractor, ranks[i]); |
| } |
| } |
| cost = (float) hyperEdge.getBestDerivationScore(); |
| |
| DerivationState state = new DerivationState(parentNode, hyperEdge, ranks, cost, edgePos); |
| if (joshuaConfiguration.rescoreForest) |
| state.bleu = state.computeBLEU(); |
| |
| return state; |
| } |
| }; |
| |
| /** |
| * A DerivationState describes which path to follow through the hypergraph. For example, it |
| * might say to use the 1-best from the first tail node, the 9th-best from the second tail node, |
| * and so on. This information is represented recursively through a chain of DerivationState |
| * objects. This function follows that chain, extracting the information according to a number |
| * of parameters, and returning results to a string, and also (optionally) accumulating the |
| * feature values into the passed-in FeatureVector. |
| */ |
| |
| // each DerivationState roughly corresponds to a hypothesis |
| public class DerivationState implements Comparable<DerivationState> { |
| /* The edge ("e" in the paper) */ |
| public HyperEdge edge; |
| |
| /* The edge's parent node */ |
| public HGNode parentNode; |
| |
| /* |
| * This state's position in its parent node's list of incoming hyperedges (used in signature |
| * calculation) |
| */ |
| public int edgePos; |
| |
| /* |
| * The rank item to select from each of the incoming tail nodes ("j" in the paper, an ArrayList |
| * of size |e|) |
| */ |
| public int[] ranks; |
| |
| /* |
| * The cost of the hypothesis, including a weighted BLEU score, if any. |
| */ |
| private float cost; |
| |
| private float bleu = 0.0f; |
| |
| /* |
| * The BLEU sufficient statistics associated with the edge's derivation. Note that this is a |
| * function of the complete derivation headed by the edge, i.e., all the particular |
| * subderivations of edges beneath it. That is why it must be contained in DerivationState |
| * instead of in the HyperEdge itself. |
| */ |
| BLEU.Stats stats = null; |
| |
| public DerivationState(HGNode pa, HyperEdge e, int[] r, float c, int pos) { |
| parentNode = pa; |
| edge = e; |
| ranks = r; |
| cost = c; |
| edgePos = pos; |
| bleu = 0.0f; |
| } |
| |
| /** |
| * Computes a scaled approximate BLEU from the accumulated statistics. We know the number of |
| * words; to compute the effective reference length, we take the real reference length statistic |
| * and scale it by the percentage of the input sentence that is consumed, based on the |
| * assumption that the total number of words in the hypothesis scales linearly with the input |
| * sentence span. |
| * |
| * @return |
| */ |
| public float computeBLEU() { |
| if (stats == null) { |
| float percentage = 1.0f * (parentNode.j - parentNode.i) / (sentence.length()); |
| // System.err.println(String.format("computeBLEU: (%d - %d) / %d = %f", parentNode.j, |
| // parentNode.i, sentence.length(), percentage)); |
| stats = BLEU.compute(edge, percentage, references); |
| |
| if (edge.getTailNodes() != null) { |
| for (int id = 0; id < edge.getTailNodes().size(); id++) { |
| stats.add(getChildDerivationState(edge, id).stats); |
| } |
| } |
| } |
| |
| return BLEU.score(stats); |
| } |
| |
| public void setCost(float cost2) { |
| this.cost = cost2; |
| } |
| |
| /** |
| * Returns the model cost. This is obtained by subtracting off the incorporated BLEU score (if |
| * used). |
| * |
| * @return |
| */ |
| public float getModelCost() { |
| return this.cost; |
| } |
| |
| /** |
| * Returns the model cost plus the BLEU score. |
| * |
| * @return |
| */ |
| public float getCost() { |
| return cost - weights.getSparse("BLEU") * bleu; |
| } |
| |
| public String toString() { |
| StringBuilder sb = new StringBuilder(String.format("DS[[ %s (%d,%d)/%d ||| ", |
| Vocabulary.word(parentNode.lhs), parentNode.i, parentNode.j, edgePos)); |
| sb.append("ranks=[ "); |
| if (ranks != null) |
| for (int i = 0; i < ranks.length; i++) |
| sb.append(ranks[i] + " "); |
| sb.append("] ||| " + String.format("%.5f ]]", cost)); |
| return sb.toString(); |
| } |
| |
| public boolean equals(Object other) { |
| if (other instanceof DerivationState) { |
| DerivationState that = (DerivationState) other; |
| if (edgePos == that.edgePos) { |
| if (ranks != null && that.ranks != null) { |
| if (ranks.length == that.ranks.length) { |
| for (int i = 0; i < ranks.length; i++) |
| if (ranks[i] != that.ranks[i]) |
| return false; |
| return true; |
| } |
| } |
| } |
| } |
| |
| return false; |
| } |
| |
| /** |
| * DerivationState objects are unique to each VirtualNode, so the unique identifying information |
| * only need contain the edge position and the ranks. |
| */ |
| public int hashCode() { |
| int hash = edgePos; |
| if (ranks != null) { |
| for (int i = 0; i < ranks.length; i++) |
| hash = hash * 53 + i; |
| } |
| |
| return hash; |
| } |
| |
| /** |
| * Visits every state in the derivation in a depth-first order. |
| */ |
| private DerivationVisitor visit(DerivationVisitor visitor) { |
| return visit(visitor, 0); |
| } |
| |
| private DerivationVisitor visit(DerivationVisitor visitor, int indent) { |
| |
| visitor.before(this, indent); |
| |
| Rule rule = edge.getRule(); |
| |
| if (null == rule) { |
| getChildDerivationState(edge, 0).visit(visitor, indent + 1); |
| } else { |
| if (edge.getTailNodes() != null) { |
| int[] english = rule.getEnglish(); |
| for (int c = 0; c < english.length; c++) { |
| if (Vocabulary.nt(english[c])) { |
| int index = -(english[c] + 1); |
| getChildDerivationState(edge, index).visit(visitor, indent + 1); |
| } |
| } |
| } |
| } |
| |
| visitor.after(this, indent); |
| |
| return visitor; |
| } |
| |
| private String getHypothesis() { |
| return getHypothesis(defaultSide); |
| } |
| |
| private String getTree() { |
| return visit(new TreeExtractor()).toString(); |
| } |
| |
| private String getHypothesis(Side side) { |
| return visit(new HypothesisExtractor(side)).toString(); |
| } |
| |
| private FeatureVector replayFeatures() { |
| FeatureReplayer fp = new FeatureReplayer(); |
| visit(fp); |
| return fp.getFeatures(); |
| } |
| |
| private String getDerivation() { |
| return visit(new DerivationExtractor()).toString(); |
| } |
| |
| /** |
| * Helper function for navigating the hierarchical list of DerivationState objects. This |
| * function looks up the VirtualNode corresponding to the HGNode pointed to by the edge's |
| * {tailNodeIndex}th tail node. |
| * |
| * @param edge |
| * @param tailNodeIndex |
| * @return |
| */ |
| public DerivationState getChildDerivationState(HyperEdge edge, int tailNodeIndex) { |
| HGNode child = edge.getTailNodes().get(tailNodeIndex); |
| VirtualNode virtualChild = getVirtualNode(child); |
| return virtualChild.nbests.get(ranks[tailNodeIndex] - 1); |
| } |
| |
| // natural order by cost |
| public int compareTo(DerivationState another) { |
| if (this.getCost() > another.getCost()) { |
| return -1; |
| } else if (this.getCost() == another.getCost()) { |
| return 0; |
| } else { |
| return 1; |
| } |
| } |
| } // end of Class DerivationState |
| |
| /** |
| * This interface provides a generic way to do things at each stage of a derivation. The |
| * DerivationState::visit() function visits every node in a derivation and calls the |
| * DerivationVisitor functions both before and after it visits each node. This provides a common |
| * way to do different things to the tree (e.g., extract its words, assemble a derivation, and so |
| * on) without having to rewrite the node-visiting code. |
| * |
| * @author Matt Post <post@cs.jhu.edu> |
| */ |
| public interface DerivationVisitor { |
| /** |
| * Called before each node's children are visited. |
| * |
| * @param state the derivation state |
| * @param level the tree depth |
| */ |
| void before(DerivationState state, int level); |
| |
| /** |
| * Called after a node's children have been visited. |
| * |
| * @param state the derivation state |
| * @param level the tree depth |
| */ |
| void after(DerivationState state, int level); |
| } |
| |
| /** |
| * Extracts the hypothesis from the leaves of the tree using the generic (depth-first) visitor. |
| * Since we're using the visitor, we can't just print out the words as we see them. We have to |
| * print the words to the left of the nonterminal, then recursively print that nonterminal, then |
| * the words after it, and so on. To accomplish this, we add rules to a stack, merging the words |
| * from terminal productions into the most recent nonterminal in the stack. |
| * |
| */ |
| public class HypothesisExtractor implements DerivationVisitor { |
| |
| private Side side; |
| private Stack<String> outputs; |
| |
| public HypothesisExtractor(Side side) { |
| this.side = side; |
| outputs = new Stack<String>(); |
| } |
| |
| String ntMatcher = ".*" + Rule.NT_REGEX + ".*"; |
| |
| void merge(String words) { |
| if (!words.matches(ntMatcher) && outputs.size() > 0 && outputs.peek().matches(ntMatcher)) { |
| String parentWords = outputs.pop(); |
| String replaced = parentWords.replaceFirst(Rule.NT_REGEX, Matcher.quoteReplacement(words)); |
| |
| merge(replaced); |
| } else { |
| outputs.add(words); |
| } |
| } |
| |
| @Override |
| /** |
| * Whenever we reach a rule in the depth-first derivaiton, we add it to the stack |
| * via a call to the merge() function. |
| */ |
| public void before(DerivationState state, int level) { |
| Rule rule = state.edge.getRule(); |
| if (rule != null) |
| if (side == Side.TARGET) { |
| // Output the alignment Moses-style, skipping <s> and </s>, and converting to indices |
| // in original sentence |
| String alignment = ""; |
| if (joshuaConfiguration.include_align_index |
| && state.parentNode.j != sentence.length() && state.parentNode.j != 1) { |
| int i = state.parentNode.j - 1 - state.edge.getRule().getFrench().length + state.edge.getRule().getArity(); |
| alignment = String.format(" |%d-%d|", i, state.parentNode.j-2); |
| } |
| merge(String.format("%s%s", state.edge.getRule().getEnglishWords(), alignment)); |
| } else |
| merge(state.edge.getRule().getFrenchWords()); |
| |
| } |
| |
| @Override |
| public void after(DerivationState state, int level) { |
| } |
| |
| /** |
| * After all rules in the grammar have been merged, there should be one item on the stack, which |
| * is the complete target (or source) string. |
| */ |
| public String toString() { |
| return outputs.pop().replaceAll("<s> ", "").replace(" </s>", ""); |
| } |
| } |
| |
| /** |
| * Assembles a Penn treebank format tree for a given derivation. |
| */ |
| public class TreeExtractor implements DerivationVisitor { |
| |
| /* The tree being built. */ |
| private Tree tree; |
| |
| public TreeExtractor() { |
| tree = null; |
| } |
| |
| /** |
| * Before visiting the children, find the fragment representation for the current rule, |
| * and merge it into the tree we're building. |
| */ |
| @Override |
| public void before(DerivationState state, int indent) { |
| HyperEdge edge = state.edge; |
| Rule rule = edge.getRule(); |
| |
| // Skip the special top-level rule |
| if (rule == null) { |
| return; |
| } |
| |
| String lhs = Vocabulary.word(rule.getLHS()); |
| String unbracketedLHS = lhs.substring(1, lhs.length() - 1); |
| |
| /* Find the fragment corresponding to this flattened rule in the fragment map; if it's not |
| * there, just pretend it's a depth-one rule. |
| */ |
| Tree fragment = Tree.getFragmentFromYield(rule.getEnglishWords()); |
| if (fragment == null) { |
| String subtree = String.format("(%s %s)", unbracketedLHS, quoteTerminals(rule.getEnglishWords())); |
| fragment = Tree.fromString(subtree); |
| } |
| |
| merge(fragment); |
| } |
| |
| /** |
| * Quotes just the terminals in the yield of a tree, represented as a string. This is to force |
| * compliance with the Tree class, which interprets all non-quoted strings as nonterminals. |
| * |
| * @param words a string of words representing a rule's yield |
| * @return |
| */ |
| private String quoteTerminals(String words) { |
| String quotedWords = ""; |
| for (String word: words.split("\\s+")) |
| if (word.startsWith("[") && word.endsWith("]")) |
| quotedWords += String.format("%s ", word); |
| else |
| quotedWords += String.format("\"%s\" ", word); |
| |
| return quotedWords.substring(0, quotedWords.length() - 1); |
| } |
| |
| @Override |
| public void after(DerivationState state, int indent) { |
| // do nothing |
| } |
| |
| public String toString() { |
| return tree.unquotedString(); |
| } |
| |
| /** |
| * Either set the root of the tree or merge this tree by grafting it onto the first nonterminal |
| * in the yield of the parent tree. |
| * |
| * @param fragment |
| */ |
| private void merge(Tree fragment) { |
| if (tree == null) { |
| tree = fragment; |
| } else { |
| Tree parent = tree.getNonterminalYield().get(0); |
| parent.setLabel(Vocabulary.word(fragment.getLabel())); |
| parent.setChildren(fragment.getChildren()); |
| } |
| } |
| } |
| |
| /** |
| * Assembles an informative version of the derivation. Each rule is printed as it is encountered. |
| * Don't try to parse this output; make something that writes out JSON or something, instead. |
| * |
| * @author Matt Post <post@cs.jhu.edu |
| */ |
| public class DerivationExtractor implements DerivationVisitor { |
| |
| StringBuffer sb; |
| |
| public DerivationExtractor() { |
| sb = new StringBuffer(); |
| } |
| |
| @Override |
| public void before(DerivationState state, int indent) { |
| |
| HyperEdge edge = state.edge; |
| Rule rule = edge.getRule(); |
| |
| if (rule != null) { |
| |
| for (int i = 0; i < indent * 2; i++) |
| sb.append(" "); |
| |
| FeatureReplayer replayer = new FeatureReplayer(); |
| replayer.before(state, indent); |
| FeatureVector transitionFeatures = replayer.getFeatures(); |
| |
| // sb.append(rule).append(" ||| " + features + " ||| " + |
| // KBestExtractor.this.weights.innerProduct(features)); |
| sb.append(String.format("%d-%d", state.parentNode.i, state.parentNode.j)); |
| sb.append(" ||| " + Vocabulary.word(rule.getLHS()) + " -> " |
| + Vocabulary.getWords(rule.getFrench()) + " /// " + rule.getEnglishWords()); |
| sb.append(" |||"); |
| for (DPState dpState : state.parentNode.getDPStates()) { |
| sb.append(" " + dpState); |
| } |
| sb.append(" ||| " + transitionFeatures); |
| sb.append(" ||| " + weights.innerProduct(transitionFeatures)); |
| if (rule.getAlignment() != null) |
| sb.append(" ||| " + Arrays.toString(rule.getAlignment())); |
| sb.append("\n"); |
| } |
| } |
| |
| public String toString() { |
| return sb.toString(); |
| } |
| |
| @Override |
| public void after(DerivationState state, int level) { |
| // TODO Auto-generated method stub |
| } |
| } |
| |
| /** |
| * During decoding, individual features values are not stored, only the model score on each edge. |
| * This saves space. If you want to print the actual feature values, they have to be assembled |
| * from the edges of the derivation, which means replaying the feature functions. This visitor |
| * does just that, using the generic derivation visitor. |
| */ |
| public class FeatureReplayer implements DerivationVisitor { |
| |
| private FeatureVector features; |
| |
| public FeatureReplayer() { |
| features = new FeatureVector(); |
| } |
| |
| public FeatureReplayer(FeatureVector useThese) { |
| features = useThese; |
| } |
| |
| public FeatureVector getFeatures() { |
| return features; |
| } |
| |
| /** |
| * We could do this in either before() or after(). |
| */ |
| @Override |
| public void before(DerivationState state, int level) { |
| if (features != null) { |
| HGNode parentNode = state.parentNode; |
| |
| HyperEdge edge = state.edge; |
| |
| FeatureVector transitionCosts = ComputeNodeResult.computeTransitionFeatures(models, edge, |
| parentNode.i, parentNode.j, sentence); |
| |
| features.add(transitionCosts); |
| } |
| } |
| |
| @Override |
| public void after(DerivationState state, int level) { |
| // Nothing to do |
| } |
| } |
| } |