| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package opennlp.tools.textsimilarity; |
| |
| import java.io.Serializable; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collections; |
| import java.util.List; |
| import java.util.Map; |
| |
| import org.apache.commons.collections.ListUtils; |
| import org.apache.commons.lang3.StringUtils; |
| |
| import opennlp.tools.parse_thicket.ParseTreeNode; |
| |
| public class ParseTreeChunk implements Serializable { |
| |
| private static final long serialVersionUID = -9007722991829174647L; |
| private String mainPOS; |
| |
| private List<String> lemmas; |
| |
| private List<String> POSs; |
| |
| private int startPos; |
| |
| private int endPos; |
| |
| private int size; |
| |
| private ParseTreeMatcher parseTreeMatcher; |
| |
| private LemmaFormManager lemmaFormManager; |
| |
| private GeneralizationListReducer generalizationListReducer; |
| |
| private List<ParseTreeNode> parseTreeNodes; |
| |
| |
| public List<ParseTreeNode> getParseTreeNodes() { |
| return parseTreeNodes; |
| } |
| |
| public void setParseTreeNodes(List<ParseTreeNode> parseTreeNodes) { |
| this.parseTreeNodes = parseTreeNodes; |
| } |
| |
| public ParseTreeChunk(){} |
| // "[<1>NP'Property':NN, <2>NP'has':VBZ, <3>NP'lots':NNS, <4>NP'of':IN, <5>NP'trash':NN, <6>NP'and':CC, <7>NP'debris':NN]"; |
| |
| public ParseTreeChunk(String phrStr){ |
| String[] parts = phrStr.replace("]","").split(", <"); |
| this.POSs = new ArrayList<>(); |
| this.lemmas = new ArrayList<>(); |
| this.mainPOS = StringUtils.substringBetween(phrStr, ">", "'"); |
| for(String part: parts){ |
| String lemma = StringUtils.substringBetween(part, "P'", "':"); |
| String pos = part.substring(part.indexOf(":")+1, part.length()); |
| |
| if (pos==null || lemma ==null){ |
| continue; |
| } |
| this.POSs.add(pos.trim()); |
| this.lemmas.add(lemma.trim()); |
| } |
| |
| } |
| |
| public ParseTreeChunk(List<String> lemmas, List<String> POSs, int startPos, |
| int endPos) { |
| this.lemmas = lemmas; |
| this.POSs = POSs; |
| this.startPos = startPos; |
| this.endPos = endPos; |
| |
| // phraseType.put(0, "np"); |
| } |
| |
| // constructor which takes lemmas and POS as lists so that phrases can be |
| // conveniently specified. |
| // usage: stand-alone runs |
| public ParseTreeChunk(String mPOS, String[] lemmas, String[] POSss) { |
| this.mainPOS = mPOS; |
| this.lemmas = new ArrayList<>(Arrays.asList(lemmas)); |
| this.POSs = new ArrayList<>(Arrays.asList(POSss)); |
| } |
| |
| // constructor which takes lemmas and POS as lists so that phrases can be |
| // conveniently specified. |
| // usage: stand-alone runs |
| public ParseTreeChunk(String mPOS, List<String> lemmas, List<String> POSss) { |
| this.mainPOS = mPOS; |
| this.lemmas = lemmas; |
| this.POSs = POSss; |
| } |
| |
| |
| public int getStartPos() { |
| return startPos; |
| } |
| |
| public void setStartPos(int startPos) { |
| this.startPos = startPos; |
| } |
| |
| public int getEndPos() { |
| return endPos; |
| } |
| |
| public void setEndPos(int endPos) { |
| this.endPos = endPos; |
| } |
| |
| public int getSize() { |
| return size; |
| } |
| |
| public void setSize(int size) { |
| this.size = size; |
| } |
| |
| public LemmaFormManager getLemmaFormManager() { |
| return lemmaFormManager; |
| } |
| |
| public void setLemmaFormManager(LemmaFormManager lemmaFormManager) { |
| this.lemmaFormManager = lemmaFormManager; |
| } |
| |
| public GeneralizationListReducer getGeneralizationListReducer() { |
| return generalizationListReducer; |
| } |
| |
| public void setGeneralizationListReducer( |
| GeneralizationListReducer generalizationListReducer) { |
| this.generalizationListReducer = generalizationListReducer; |
| } |
| |
| public void setParseTreeMatcher(ParseTreeMatcher parseTreeMatcher) { |
| this.parseTreeMatcher = parseTreeMatcher; |
| } |
| |
| public ParseTreeChunk(List<ParseTreeNode> ps) { |
| this.lemmas = new ArrayList<>(); |
| this.POSs = new ArrayList<>(); |
| for(ParseTreeNode n: ps){ |
| this.lemmas.add(n.getWord()); |
| this.POSs.add(n.getPos()); |
| } |
| |
| if (ps.size()>0){ |
| this.setMainPOS(ps.get(0).getPhraseType()); |
| this.parseTreeNodes = ps; |
| } |
| } |
| |
| public List<ParseTreeChunk> buildChunks(List<LemmaPair> parseResults) { |
| List<ParseTreeChunk> chunksResults = new ArrayList<>(); |
| for (LemmaPair chunk : parseResults) { |
| String[] lemmasAr = chunk.getLemma().split(" "); |
| List<String> poss = new ArrayList<>(), lems = new ArrayList<>(); |
| for (String lem : lemmasAr) { |
| lems.add(lem); |
| // now looking for POSs for individual word |
| for (LemmaPair chunkCur : parseResults) { |
| if (chunkCur.getLemma().equals(lem) |
| && |
| // check that this is a proper word in proper position |
| chunkCur.getEndPos() <= chunk.getEndPos() |
| && chunkCur.getStartPos() >= chunk.getStartPos()) { |
| poss.add(chunkCur.getPOS()); |
| break; |
| } |
| } |
| } |
| if (lems.size() != poss.size()) { |
| System.err.println("lems.size()!= poss.size()"); |
| } |
| if (lems.size() < 2) { // single word phrase, nothing to match |
| continue; |
| } |
| ParseTreeChunk ch = new ParseTreeChunk(lems, poss, chunk.getStartPos(), |
| chunk.getEndPos()); |
| ch.setMainPOS(chunk.getPOS()); |
| chunksResults.add(ch); |
| } |
| return chunksResults; |
| } |
| |
| public List<List<ParseTreeChunk>> matchTwoSentencesGivenPairLists( |
| List<LemmaPair> sent1Pairs, List<LemmaPair> sent2Pairs) { |
| |
| List<ParseTreeChunk> chunk1List = buildChunks(sent1Pairs); |
| List<ParseTreeChunk> chunk2List = buildChunks(sent2Pairs); |
| |
| List<List<ParseTreeChunk>> sent1GrpLst = groupChunksAsParses(chunk1List); |
| List<List<ParseTreeChunk>> sent2GrpLst = groupChunksAsParses(chunk2List); |
| |
| System.out.println("=== Grouped chunks 1 " + sent1GrpLst); |
| System.out.println("=== Grouped chunks 2 " + sent2GrpLst); |
| |
| return matchTwoSentencesGroupedChunks(sent1GrpLst, sent2GrpLst); |
| } |
| |
| // groups noun phrases, verb phrases, propos phrases etc. for separate match |
| |
| public List<List<ParseTreeChunk>> groupChunksAsParses( |
| List<ParseTreeChunk> parseResults) { |
| List<ParseTreeChunk> np = new ArrayList<>(), vp = new ArrayList<>(), prp = new ArrayList<>(), sbarp = new ArrayList<>(), pp = new ArrayList<>(), adjp = new ArrayList<>(), whadvp = new ArrayList<>(), restOfPhrasesTypes = new ArrayList<>(); |
| List<List<ParseTreeChunk>> results = new ArrayList<>(); |
| for (ParseTreeChunk ch : parseResults) { |
| String mainPos = ch.getMainPOS().toLowerCase(); |
| |
| if (mainPos.equals("s")) { |
| continue; |
| } |
| switch (mainPos) { |
| case "np": |
| np.add(ch); |
| break; |
| case "vp": |
| vp.add(ch); |
| break; |
| case "prp": |
| prp.add(ch); |
| break; |
| case "pp": |
| pp.add(ch); |
| break; |
| case "adjp": |
| adjp.add(ch); |
| break; |
| case "whadvp": |
| whadvp.add(ch); |
| break; |
| case "sbar": |
| sbarp.add(ch); |
| break; |
| default: |
| restOfPhrasesTypes.add(ch); |
| break; |
| } |
| |
| } |
| results.add(np); |
| results.add(vp); |
| results.add(prp); |
| results.add(pp); |
| results.add(adjp); |
| results.add(whadvp); |
| results.add(restOfPhrasesTypes); |
| |
| return results; |
| |
| } |
| |
| // main function to generalize two expressions grouped by phrase types |
| // returns a list of generalizations for each phrase type with filtered |
| // sub-expressions |
| public List<List<ParseTreeChunk>> matchTwoSentencesGroupedChunks( |
| List<List<ParseTreeChunk>> sent1, List<List<ParseTreeChunk>> sent2) { |
| List<List<ParseTreeChunk>> results = new ArrayList<>(); |
| // first irerate through component |
| for (int comp = 0; comp < 2 && // just np & vp |
| comp < sent1.size() && comp < sent2.size(); comp++) { |
| List<ParseTreeChunk> resultComps = new ArrayList<>(); |
| // then iterate through each phrase in each component |
| for (ParseTreeChunk ch1 : sent1.get(comp)) { |
| for (ParseTreeChunk ch2 : sent2.get(comp)) { // simpler version |
| ParseTreeChunk chunkToAdd = parseTreeMatcher |
| .generalizeTwoGroupedPhrasesRandomSelectHighestScoreWithTransforms( |
| ch1, ch2); |
| |
| if (!lemmaFormManager.mustOccurVerifier(ch1, ch2, chunkToAdd)) { |
| continue; // if the words which have to stay do not stay, proceed to |
| // other elements |
| } |
| boolean alreadyThere = false; |
| for (ParseTreeChunk chunk : resultComps) { |
| if (chunk.equalsTo(chunkToAdd)) { |
| alreadyThere = true; |
| break; |
| } |
| |
| if (parseTreeMatcher |
| .generalizeTwoGroupedPhrasesRandomSelectHighestScore(chunk, |
| chunkToAdd).equalsTo(chunkToAdd)) { |
| alreadyThere = true; |
| break; |
| } |
| } |
| |
| if (!alreadyThere) { |
| resultComps.add(chunkToAdd); |
| } |
| |
| List<ParseTreeChunk> resultCompsReduced = generalizationListReducer |
| .applyFilteringBySubsumption(resultComps); |
| // if (resultCompsReduced.size() != resultComps.size()) |
| // System.out.println("reduction of gen list occurred"); |
| } |
| } |
| results.add(resultComps); |
| } |
| |
| return results; |
| } |
| |
| /* public Boolean equals(ParseTreeChunk ch) { |
| List<String> lems = ch.getLemmas(); |
| List<String> poss = ch.POSs; |
| |
| if (this.lemmas.size() <= lems.size()) |
| return false; // sub-chunk should be shorter than chunk |
| |
| for (int i = 0; i < lems.size() && i < this.lemmas.size(); i++) { |
| if (!(this.lemmas.get(i).equals(lems.get(i)) && this.POSs.get(i).equals( |
| poss.get(i)))) |
| return false; |
| } |
| return true; |
| } |
| */ |
| // 'this' is super - chunk of ch, ch is sub-chunk of 'this' |
| public Boolean isASubChunk_OLD(ParseTreeChunk ch) { |
| List<String> lems = ch.getLemmas(); |
| List<String> poss = ch.POSs; |
| |
| if (this.lemmas.size() < lems.size()) |
| return false; // sub-chunk should be shorter than chunk |
| |
| for (int i = 0; i < lems.size() && i < this.lemmas.size(); i++) { |
| if (!(this.lemmas.get(i).equals(lems.get(i)) && this.POSs.get(i).equals( |
| poss.get(i)))) |
| return false; |
| } |
| return true; |
| } |
| |
| // this => value ch => * |
| public Boolean isASubChunk(ParseTreeChunk ch) { |
| List<String> lems = ch.getLemmas(); |
| List<String> poss = ch.POSs; |
| |
| if (this.lemmas.size() < lems.size()) |
| return false; // sub-chunk should be shorter than chunk |
| |
| boolean notSubChunkWithGivenAlignment = false, unComparable = false; |
| |
| for (int i = 0; i < lems.size() && i < this.lemmas.size(); i++) { |
| // both lemma and pos are different |
| if (!this.POSs.get(i).equals(poss.get(i)) && !this.lemmas.get(i).equals(lems.get(i)) ){ |
| unComparable = true; |
| break; |
| } |
| |
| // this => * ch=> run |
| if (!this.lemmas.get(i).equals(lems.get(i)) && this.lemmas.get(i).equals("*")) |
| notSubChunkWithGivenAlignment = true; |
| } |
| if (!notSubChunkWithGivenAlignment && !unComparable) |
| return true; |
| |
| List<String> thisPOS = new ArrayList<>(this.POSs); |
| Collections.reverse(thisPOS); |
| List<String> chPOS = new ArrayList<>(poss); |
| Collections.reverse(chPOS); |
| List<String> thisLemma = new ArrayList<>(this.lemmas); |
| Collections.reverse(thisLemma ); |
| List<String> chLemma = new ArrayList<>(lems); |
| Collections.reverse(chLemma); |
| |
| notSubChunkWithGivenAlignment = false; unComparable = false; |
| for (int i = lems.size()-1 ; i>=0; i--) { |
| // both lemma and pos are different |
| if (!thisPOS.get(i).equals(chPOS.get(i)) && !thisLemma.get(i).equals(chLemma.get(i)) ){ |
| unComparable = true; |
| break; |
| } |
| |
| // this => * ch=> run |
| if (!thisLemma.get(i).equals(chLemma.get(i)) && thisLemma.get(i).equals("*")) |
| notSubChunkWithGivenAlignment = true; |
| } |
| |
| if (!notSubChunkWithGivenAlignment && !unComparable) |
| return true; |
| else |
| return false; // then ch is redundant and needs to be removed |
| } |
| |
| public Boolean equalsTo(ParseTreeChunk ch) { |
| List<String> lems = ch.getLemmas(); |
| List<String> poss = ch.POSs; |
| if (this.lemmas.size() != lems.size() || this.POSs.size() != poss.size()) |
| return false; |
| |
| for (int i = 0; i < lems.size(); i++) { |
| if (!(this.lemmas.get(i).equals(lems.get(i)) && this.POSs.get(i).equals( |
| poss.get(i)))) |
| return false; |
| } |
| |
| return true; |
| } |
| |
| public boolean equals(ParseTreeChunk ch) { |
| List<String> lems = ch.getLemmas(); |
| List<String> poss = ch.POSs; |
| return ListUtils.isEqualList(ch.getLemmas(), this.lemmas) && ListUtils.isEqualList(ch.getPOSs(), this.POSs); |
| } |
| |
| public String toString() { |
| StringBuilder buf = new StringBuilder(" ["); |
| if (mainPOS != null) |
| buf = new StringBuilder(mainPOS + " ["); |
| for (int i = 0; i < lemmas.size() && i < POSs.size() ; i++) { |
| buf.append(POSs.get(i)).append("-").append(lemmas.get(i)).append(" "); |
| if (this.parseTreeNodes!=null){ |
| Map<String, Object> attrs = this.parseTreeNodes.get(i).getAttributes(); |
| if (attrs!=null && attrs.keySet().size()>0){ |
| buf.append(attrs).append(" "); |
| } |
| String ner =this.parseTreeNodes.get(i).getNe(); |
| if (ner!=null && ner.length()>1) |
| buf.append("(").append(ner).append(") "); |
| } |
| } |
| return buf + "]"; |
| } |
| |
| public String toWordOnlyString(){ |
| StringBuilder buf = new StringBuilder(); |
| |
| for (String lemma : lemmas) { |
| buf.append(lemma).append(" "); |
| } |
| return buf.toString().trim(); |
| } |
| |
| public int compareTo(ParseTreeChunk o) { |
| if (this.size > o.size) |
| return -1; |
| else |
| return 1; |
| |
| } |
| |
| public String listToString(List<List<ParseTreeChunk>> chunks) { |
| StringBuilder buf = new StringBuilder(); |
| if (chunks.get(0).size() > 0) { |
| buf.append(" np ").append(chunks.get(0).toString()); |
| } |
| if (chunks.get(1).size() > 0) { |
| buf.append(" vp ").append(chunks.get(1).toString()); |
| } |
| if (chunks.size() < 3) { |
| return buf.toString(); |
| } |
| if (chunks.get(2).size() > 0) { |
| buf.append(" prp ").append(chunks.get(2).toString()); |
| } |
| if (chunks.get(3).size() > 0) { |
| buf.append(" pp ").append(chunks.get(3).toString()); |
| } |
| if (chunks.get(4).size() > 0) { |
| buf.append(" adjp ").append(chunks.get(4).toString()); |
| } |
| if (chunks.get(5).size() > 0) { |
| buf.append(" whadvp ").append(chunks.get(5).toString()); |
| } |
| /* |
| * if (mainPos.equals("np")) np.add(ch); else if (mainPos.equals( "vp")) |
| * vp.add(ch); else if (mainPos.equals( "prp")) prp.add(ch); else if |
| * (mainPos.equals( "pp")) pp.add(ch); else if (mainPos.equals( "adjp")) |
| * adjp.add(ch); else if (mainPos.equals( "whadvp")) whadvp.add(ch); |
| */ |
| return buf.toString(); |
| } |
| |
| public List<List<ParseTreeChunk>> obtainParseTreeChunkListByParsingList( |
| String toParse) { |
| List<List<ParseTreeChunk>> results = new ArrayList<>(); |
| // if (toParse.endsWith("]]]")){ |
| // toParse = toParse.replace("[[","").replace("]]",""); |
| // } |
| toParse = toParse.replace(" ]], [ [", "&"); |
| String[] phraseTypeFragments = toParse.trim().split("&"); |
| for (String toParseFragm : phraseTypeFragments) { |
| toParseFragm = toParseFragm.replace("], [", "#"); |
| |
| List<ParseTreeChunk> resultsPhraseType = new ArrayList<>(); |
| String[] indivChunks = toParseFragm.trim().split("#"); |
| for (String expr : indivChunks) { |
| List<String> lems = new ArrayList<>(), poss = new ArrayList<>(); |
| expr = expr.replace("[", "").replace(" ]", ""); |
| String[] pairs = expr.trim().split(" "); |
| for (String word : pairs) { |
| word = word.replace("]]", "").replace("]", ""); |
| String[] pos_lem = word.split("-"); |
| lems.add(pos_lem[1].trim()); |
| poss.add(pos_lem[0].trim()); |
| } |
| ParseTreeChunk ch = new ParseTreeChunk(); |
| ch.setLemmas(lems); |
| ch.setPOSs(poss); |
| resultsPhraseType.add(ch); |
| } |
| results.add(resultsPhraseType); |
| } |
| return results; |
| |
| // 2.1 | Vietnam <b>embassy</b> <b>in</b> <b>Israel</b>: information on how |
| // to get your <b>visa</b> at Vietnam |
| // <b>embassy</b> <b>in</b> <b>Israel</b>. <b>...</b> <b>Spain</b>. |
| // Scotland. Sweden. Slovakia. Switzerland. T |
| // [Top of Page] <b>...</b> |
| // [[ [NN-* IN-in NP-israel ], [NP-* IN-in NP-israel ], [NP-* IN-* TO-* NN-* |
| // ], [NN-visa IN-* NN-* IN-in ]], [ |
| // [VB-get NN-visa IN-* NN-* IN-in .-* ], [VBD-* IN-* NN-* NN-* .-* ], [VB-* |
| // NP-* ]]] |
| |
| } |
| |
| public void setMainPOS(String mainPOS) { |
| this.mainPOS = mainPOS; |
| } |
| |
| public String getMainPOS() { |
| return mainPOS; |
| } |
| |
| public List<String> getLemmas() { |
| return lemmas; |
| } |
| |
| public void setLemmas(List<String> lemmas) { |
| this.lemmas = lemmas; |
| } |
| |
| public List<String> getPOSs() { |
| return POSs; |
| } |
| |
| public void setPOSs(List<String> pOSs) { |
| POSs = pOSs; |
| } |
| |
| public ParseTreeMatcher getParseTreeMatcher() { |
| return parseTreeMatcher; |
| } |
| |
| public static void main(String[] args){ |
| String phrStr = "[<1>NP'Property':NN, <2>NP'has':VBZ, <3>NP'lots':NNS, <4>NP'of':IN, <5>NP'trash':NN, <6>NP'and':CC, <7>NP'debris':NN]"; |
| ParseTreeChunk ch = new ParseTreeChunk(phrStr); |
| System.out.println(ch); |
| } |
| } |