| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package opennlp.tools.parse_thicket.external_rst; |
| |
| import java.io.File; |
| import java.io.IOException; |
| import java.nio.charset.Charset; |
| import java.util.ArrayList; |
| import java.util.HashMap; |
| import java.util.List; |
| import java.util.Map; |
| |
| import org.apache.commons.io.FileUtils; |
| import org.apache.commons.lang.StringUtils; |
| |
| import edu.stanford.nlp.trees.Tree; |
| import opennlp.tools.parse_thicket.ArcType; |
| import opennlp.tools.parse_thicket.Pair; |
| import opennlp.tools.parse_thicket.ParseThicket; |
| import opennlp.tools.parse_thicket.ParseTreeNode; |
| import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc; |
| import opennlp.tools.parse_thicket.matching.Matcher; |
| import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder; |
| import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer; |
| |
| public class ExternalRSTImporter extends PT2ThicketPhraseBuilder{ |
| private StringDistanceMeasurer strDistProc = new StringDistanceMeasurer (); |
| private String resourceDir = null; |
| |
| public ExternalRSTImporter(){ |
| try { |
| resourceDir = new File( "." ).getCanonicalPath()+"/src/test/resources"; |
| } catch (IOException e) { |
| // TODO Auto-generated catch block |
| e.printStackTrace(); |
| } |
| } |
| |
| public List<RstNode> buildArrayOfRSTnodes(ParseThicket pt, String jotyDumpFileName){ |
| String dump=null; |
| try { |
| dump = FileUtils.readFileToString(new File(jotyDumpFileName), Charset.defaultCharset().toString()); |
| } catch (IOException e) { |
| e.printStackTrace(); |
| return null; |
| } |
| List<RstNode> nodes = new ArrayList<RstNode>(); |
| String[] lines = dump.split("\n"); |
| int startOfDim = StringUtils.lastIndexOf(lines[0], " "); |
| String dimStr = lines[0].substring(startOfDim).replace(")", "").trim(); |
| int dim = Integer.parseInt(dimStr); |
| Integer[][] rstArcsIndices = new Integer[dim][dim]; |
| for(int i=1; i< lines.length; i++){ |
| RstNode node = new RstNode(lines[i]); |
| nodes.add(node); |
| } |
| return nodes; |
| } |
| |
| private Map<String, Integer > phraseRstIndex = new HashMap<String, Integer >(); |
| private Map<Integer, List<ParseTreeNode> > rstIndexPhrase = new HashMap<Integer, List<ParseTreeNode> > (); |
| |
| public List<WordWordInterSentenceRelationArc> buildRSTArcsFromRSTparser( List<RstNode> rstNodes, |
| List<WordWordInterSentenceRelationArc> arcs, |
| Map<Integer, List<List<ParseTreeNode>>> sentNumPhrasesMap, |
| ParseThicket pt ) { |
| List<WordWordInterSentenceRelationArc> arcsRST = new ArrayList<WordWordInterSentenceRelationArc>(); |
| |
| for(int nSentFrom=0; nSentFrom<pt.getSentences().size(); nSentFrom++){ |
| for(int nSentTo=nSentFrom+1; nSentTo<pt.getSentences().size(); nSentTo++){ |
| |
| // label all phrases with EDU |
| List<List<ParseTreeNode>> phrasesFrom = sentNumPhrasesMap.get(nSentFrom); |
| for(List<ParseTreeNode> p: phrasesFrom ){ |
| Integer rstIndex = findBestRstNodeTextForAPhrase(p, rstNodes); |
| if (rstIndex!=null){ |
| phraseRstIndex.put(p.toString(), rstIndex ); |
| rstIndexPhrase.put(rstIndex , p); |
| } |
| } |
| List<List<ParseTreeNode>> phrasesTo = sentNumPhrasesMap.get(nSentTo); |
| for(List<ParseTreeNode> p: phrasesTo ){ |
| Integer rstIndex = findBestRstNodeTextForAPhrase(p, rstNodes); |
| if (rstIndex!=null){ |
| phraseRstIndex.put(p.toString(), rstIndex ); |
| rstIndexPhrase.put(rstIndex , p); |
| } |
| } |
| } |
| } // for a pair of phrases, discover ^ in RST tree which connects these sentences |
| |
| for( int nSentFrom=0; nSentFrom<pt.getSentences().size(); nSentFrom++){ |
| for(int nSentTo=nSentFrom+1; nSentTo<pt.getSentences().size(); nSentTo++){ |
| System.out.println("Sent from # = "+nSentFrom + " -- " + "Sent to # = "+nSentTo); |
| |
| List<List<ParseTreeNode>> phrasesFrom = sentNumPhrasesMap.get(nSentFrom); |
| List<List<ParseTreeNode>> phrasesTo = sentNumPhrasesMap.get(nSentTo); |
| for(List<ParseTreeNode> vpFrom: phrasesFrom){ |
| for(List<ParseTreeNode> vpTo: phrasesTo){ |
| System.out.println("Computing arc between phrases "+ vpFrom + " => " + vpTo); |
| // get two RST nodes |
| Integer rstNodeFrom = phraseRstIndex.get(vpFrom.toString()); |
| Integer rstNodeTo = phraseRstIndex.get(vpTo.toString()); |
| if (rstNodeFrom==null || rstNodeTo==null || rstNodeFrom >= rstNodeTo) |
| continue; |
| System.out.println("Finding RST path for phrases "+ vpFrom + "' and '"+vpTo); |
| System.out.println("Sent from # = "+nSentFrom + " -- " + "Sent to # = "+nSentTo); |
| |
| Integer commonAncestorIndex = findCommonAncestor(rstNodeFrom , rstNodeTo, rstNodes); |
| if (commonAncestorIndex!=null){ |
| // and figure out if they can be properly connected by an arc, by navigating RST tree |
| ArcType arcType = new ArcType("rst", rstNodes.get(rstNodeTo).getRel2par(), 0, 0); |
| WordWordInterSentenceRelationArc arcRST = |
| new WordWordInterSentenceRelationArc(new Pair<Integer, Integer>(nSentFrom, vpFrom.get(0).getId()), |
| new Pair<Integer, Integer>(nSentTo, vpFrom.get(0).getId()), "", "", arcType); |
| arcsRST.add(arcRST); |
| |
| } |
| } |
| } |
| } |
| } |
| |
| return arcsRST; |
| } |
| |
| private Integer findAncestorForRSTnode(Integer rstNodeFrom, List<RstNode> rstNodes){ |
| RstNode initNode = rstNodes.get(rstNodeFrom); |
| if (initNode.level==null) |
| return null; |
| |
| try { |
| int initLevel = initNode.level; |
| int iter=1; // start with moving one step up |
| while (rstNodeFrom-iter>=0) { |
| Integer currLevel= rstNodes.get(rstNodeFrom-iter).level; |
| if ( currLevel!=null && currLevel< initLevel ) // found ancestor |
| return rstNodeFrom-iter; |
| iter++; |
| } |
| } catch (Exception e) { |
| // TODO Auto-generated catch block |
| e.printStackTrace(); |
| } |
| return null; |
| } |
| |
| private Integer findCommonAncestor(Integer rstNodeFrom, Integer rstNodeTo, |
| List<RstNode> rstNodes) { |
| List<Integer> ancestorsFrom = new ArrayList<Integer>() , ancestorsTo = new ArrayList<Integer>(); |
| ancestorsFrom.add(rstNodeFrom); ancestorsTo.add(rstNodeTo); |
| int curLevel = rstNodes.get(rstNodeTo).level; |
| Integer rstNodeFromCurrent = rstNodeFrom, rstNodeToCurrent = rstNodeTo; |
| |
| while(curLevel>0){ |
| if (rstNodeFromCurrent !=null) { |
| rstNodeFromCurrent = findAncestorForRSTnode(rstNodeFromCurrent, rstNodes); |
| } |
| if (rstNodeToCurrent != null){ |
| rstNodeToCurrent = findAncestorForRSTnode(rstNodeToCurrent, rstNodes); |
| } |
| if (rstNodeFromCurrent !=null) { |
| ancestorsFrom.add(rstNodeFromCurrent); |
| } |
| if (rstNodeToCurrent != null) |
| ancestorsTo.add(rstNodeToCurrent); |
| |
| List<Integer> ancestorsFromCurr = new ArrayList<Integer>(ancestorsFrom); |
| ancestorsFromCurr.retainAll(ancestorsTo); |
| if (! ancestorsFromCurr.isEmpty()){ |
| System.out.println("Found comm ancestor "+rstNodes.get(ancestorsFromCurr.get(0)).toString() + " id = "+ancestorsFromCurr.get(0) + |
| " for two RST nodes | id = "+rstNodeFrom + "'"+ |
| rstNodes.get(rstNodeFrom).toString() + "' and | id = "+ rstNodeTo + "'"+ rstNodes.get(rstNodeTo).toString()+"'"); |
| String rel2par = rstNodes.get(ancestorsFromCurr.get(0)).rel2par; |
| // if common ancestor is trivial, return null and do not form a link |
| if (rel2par==null) // || rel2par.equals("span")) |
| return null; |
| else |
| return ancestorsFromCurr.get(0); |
| } |
| curLevel--; |
| } |
| return null; |
| } |
| |
| private Integer findBestRstNodeTextForAPhrase(List<ParseTreeNode> ps, |
| List<RstNode> rstNodes) { |
| // firt get the phrase string |
| String phraseStr=""; |
| for(ParseTreeNode n: ps){ |
| phraseStr+=" "+n.getWord(); |
| } |
| phraseStr= phraseStr.trim(); |
| if (phraseStr.length()<10){ |
| return null; |
| } |
| // now look for closest EDU text from the list of all |
| double rMin = -10000d; Integer index = -1; |
| int count =0; |
| for(RstNode r: rstNodes){ |
| if (r.getText()==null || r.getText().length()<10){ |
| count++; |
| continue; |
| } |
| double res = strDistProc.measureStringDistanceNoStemming(phraseStr, r.getText()); |
| if (res > rMin){ |
| rMin=res; |
| index = count; |
| } |
| count++; |
| } |
| if (index==-1) |
| return null; |
| System.out.println("Found RST node "+ rstNodes.get(index) +" for phrase ="+phraseStr); |
| return index; |
| } |
| |
| /* |
| * Building phrases takes a Parse Thicket and forms phrases for each sentence individually |
| * Then based on built phrases and obtained arcs, it builds arcs for RST |
| * Finally, based on all formed arcs, it extends phrases with thicket phrases |
| */ |
| |
| public List<WordWordInterSentenceRelationArc> buildPT2ptPhrases(ParseThicket pt, String externalRSTresultFilename ) { |
| List<List<ParseTreeNode>> phrasesAllSent = new ArrayList<List<ParseTreeNode>> (); |
| Map<Integer, List<List<ParseTreeNode>>> sentNumPhrases = new HashMap<Integer, List<List<ParseTreeNode>>>(); |
| // build regular phrases |
| for(int nSent=0; nSent<pt.getSentences().size(); nSent++){ |
| List<ParseTreeNode> sentence = pt.getNodesThicket().get(nSent); |
| Tree ptree = pt.getSentences().get(nSent); |
| //ptree.pennPrint(); |
| List<List<ParseTreeNode>> phrases = buildPT2ptPhrasesForASentence(ptree, sentence); |
| System.out.println(phrases); |
| phrasesAllSent.addAll(phrases); |
| sentNumPhrases.put(nSent, phrases); |
| |
| } |
| // TODO: code to run joty suite |
| List<RstNode> rstNodes = new ExternalRSTImporter().buildArrayOfRSTnodes(null, resourceDir+externalRSTresultFilename ); |
| |
| // discover and add RST arcs |
| List<WordWordInterSentenceRelationArc> arcsRST = buildRSTArcsFromRSTparser( rstNodes, null, sentNumPhrases, pt ); |
| System.out.println(arcsRST); |
| return arcsRST; |
| |
| } |
| |
| |
| |
| } |