| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package opennlp.tools.parse_thicket.matching; |
| |
| import java.util.ArrayList; |
| import java.util.HashSet; |
| import java.util.List; |
| |
| import opennlp.tools.parse_thicket.ParseTreeNode; |
| import opennlp.tools.textsimilarity.ParseTreeChunk; |
| |
| public class NERPhraseGeneralizer extends PhraseGeneralizer { |
| |
| /* |
| * alignment is based on NER values, not on POS now |
| */ |
| |
| |
| /** |
| * key matching function which takes two phrases, aligns them and finds a set |
| * of maximum common sub-phrase |
| * |
| * @param chunk1o |
| * @param chunk2o |
| * @return |
| */ |
| @Override |
| public List<ParseTreeChunk> generalize( |
| Object chunk1o, Object chunk2o) { |
| |
| ParseTreeChunk chunk1 = (ParseTreeChunk)chunk1o, chunk2 = (ParseTreeChunk)chunk2o; |
| List<ParseTreeNode> results = new ArrayList<>(); |
| List<ParseTreeChunk> resultChunks = new ArrayList<>(); |
| |
| |
| List<String> pos1 = chunk1.getPOSs(); |
| List<String> pos2 = chunk2.getPOSs(); |
| List<String> lem1 = chunk1.getLemmas(); |
| List<String> lem2 = chunk2.getLemmas(); |
| |
| List<String> ner1 = new ArrayList<>(); |
| List<String> ner2 = new ArrayList<>(); |
| |
| |
| for (ParseTreeNode node: chunk1.getParseTreeNodes()) { |
| if (node.getNe()!=null && !node.getNe().equals("O")) |
| ner1.add(node.getNe()); |
| } |
| |
| for (ParseTreeNode node: chunk2.getParseTreeNodes()) { |
| if (node.getNe()!=null && !node.getNe().equals("O")) |
| ner2.add(node.getNe()); |
| } |
| |
| |
| List<String> overlap = new ArrayList<>(ner1); |
| overlap.retainAll(ner2); |
| overlap = new ArrayList<>(new HashSet<>(overlap)); |
| |
| |
| if (overlap == null || overlap.size() < 1) |
| return null; |
| |
| List<Integer> occur1 = new ArrayList<>(), occur2 = new ArrayList<>(); |
| for (String word : overlap) { |
| Integer i1 = ner1.indexOf(word); |
| Integer i2 = ner2.indexOf(word); |
| occur1.add(i1); |
| occur2.add(i2); |
| } |
| |
| |
| // for verbs find alignment even if no same verb lemmas, just any pair of verbs. Usually should be 0,0 |
| if (chunk1.getMainPOS().startsWith("VP") && chunk2.getMainPOS().startsWith("VP")) { |
| Integer i1 = null, i2 = null; |
| for(int i=0; i< pos1.size(); i++){ |
| if (pos1.get(i).startsWith("VB")){ |
| i1 = i; |
| break; |
| } |
| } |
| |
| for(int i=0; i< pos2.size(); i++){ |
| if (pos2.get(i).startsWith("VB")){ |
| i2 = i; |
| break; |
| } |
| } |
| occur1.add(i1); |
| occur2.add(i2); |
| } |
| // now we search for plausible sublists of overlaps |
| // if at some position correspondence is inverse (one of two position |
| // decreases instead of increases) |
| // then we terminate current alignment accum and start a new one |
| List<List<int[]>> overlapsPlaus = new ArrayList<>(); |
| // starts from 1, not 0 |
| List<int[]> accum = new ArrayList<>(); |
| accum.add(new int[] { occur1.get(0), occur2.get(0) }); |
| for (int i = 1; i < occur1.size(); i++) { |
| |
| if (occur1.get(i) > occur1.get(i - 1) |
| && occur2.get(i) > occur2.get(i - 1)) |
| accum.add(new int[] { occur1.get(i), occur2.get(i) }); |
| else { |
| overlapsPlaus.add(accum); |
| if (occur1!=null && occur2!=null && i<occur1.size() && i<occur2.size() ){ |
| accum = new ArrayList<>(); |
| accum.add(new int[] { occur1.get(i), occur2.get(i) }); |
| } |
| } |
| } |
| if (accum.size() > 0) { |
| overlapsPlaus.add(accum); |
| } |
| |
| |
| for (List<int[]> occur : overlapsPlaus) { |
| List<Integer> occr1 = new ArrayList<>(), occr2 = new ArrayList<>(); |
| for (int[] column : occur) { |
| occr1.add(column[0]); |
| occr2.add(column[1]); |
| } |
| |
| int ov1 = 0, ov2 = 0; // iterators over common words; |
| List<String> commonPOS = new ArrayList<>(), commonLemmas = new ArrayList<>(); |
| // we start two words before first word |
| int k1 = occr1.get(ov1) - 2, k2 = occr2.get(ov2) - 2; |
| boolean bReachedCommonWord = false; |
| while (k1 < 0 || k2 < 0) { |
| k1++; |
| k2++; |
| } |
| int k1max = pos1.size() - 1, k2max = pos2.size() - 1; |
| while (k1 <= k1max && k2 <= k2max) { |
| /* // first check if the same POS |
| String sim = posManager.similarPOS(pos1.get(k1), pos2.get(k2)); |
| String lemmaMatch = lemmaFormManager.matchLemmas(ps, lem1.get(k1), |
| lem2.get(k2), sim); |
| */ |
| String sim = null; |
| List<String> sims = posManager.//similarPOS(pos1.get(k1), pos2.get(k2)); |
| generalize(pos1.get(k1), pos2.get(k2)); |
| if (!sims.isEmpty()) |
| sim = sims.get(0); |
| |
| String lemmaMatch = null; |
| List<String> lemmaMatchs = lemmaFormManager.//matchLemmas(ps, |
| generalize(lem1.get(k1), |
| lem2.get(k2)); |
| if (!lemmaMatchs.isEmpty()) |
| lemmaMatch = lemmaMatchs.get(0); |
| |
| |
| |
| if ((sim != null) |
| && (lemmaMatch == null || (lemmaMatch != null && !lemmaMatch |
| .equals("fail")))) { |
| commonPOS.add(pos1.get(k1)); |
| |
| |
| // doing parse tree node generalization |
| List<ParseTreeNode> genRes = nodeGen.generalize(chunk1.getParseTreeNodes().get(k1), chunk2.getParseTreeNodes().get(k2)); |
| if (genRes.size()==1) |
| results.add(genRes.get(0)); |
| |
| if (lemmaMatch != null) { |
| commonLemmas.add(lemmaMatch); |
| // System.out.println("Added "+lemmaMatch); |
| if (k1 == occr1.get(ov1) && k2 == occr2.get(ov2)) |
| bReachedCommonWord = true; // now we can have different increment |
| // opera |
| else { |
| if (occr1.size() > ov1 + 1 && occr2.size() > ov2 + 1 |
| && k1 == occr1.get(ov1 + 1) && k2 == occr2.get(ov2 + 1)) { |
| ov1++; |
| ov2++; |
| bReachedCommonWord = true; |
| } |
| // else |
| // System.err.println("Next match reached '"+lemmaMatch+ |
| // "' | k1 - k2: "+k1 + " "+k2 + |
| // "| occur index ov1-ov2 "+ |
| // ov1+" "+ov2+ |
| // "| identified positions of match: occr1.get(ov1) - occr2.get(ov1) " |
| // + |
| // occr1.get(ov1) + " "+ occr2.get(ov1)); |
| } |
| } else { |
| commonLemmas.add("*"); |
| } // the same parts of speech, proceed to the next word in both |
| // expressions |
| k1++; |
| k2++; |
| |
| } else if (!bReachedCommonWord) { |
| k1++; |
| k2++; |
| } // still searching |
| else { |
| // different parts of speech, jump to the next identified common word |
| ov1++; |
| ov2++; |
| if (ov1 > occr1.size() - 1 || ov2 > occr2.size() - 1) |
| break; |
| // now trying to find |
| int kk1 = occr1.get(ov1) - 2, // new positions of iterators |
| kk2 = occr2.get(ov2) - 2; |
| int countMove = 0; |
| while ((kk1 < k1 + 1 || kk2 < k2 + 1) && countMove < 2) { // if it is |
| // behind |
| // current |
| // position, |
| // synchronously |
| // move |
| // towards |
| // right |
| kk1++; |
| kk2++; |
| countMove++; |
| } |
| k1 = kk1; |
| k2 = kk2; |
| |
| if (k1 > k1max) |
| k1 = k1max; |
| if (k2 > k2max) |
| k2 = k2max; |
| bReachedCommonWord = false; |
| } |
| } |
| ParseTreeChunk currResult = new ParseTreeChunk(results), |
| currResultOld = new ParseTreeChunk(commonLemmas, commonPOS, 0, 0); |
| |
| |
| resultChunks.add(currResult); |
| } |
| |
| return resultChunks; |
| } |
| |
| } |