opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/NERPhraseGeneralizer.java - opennlp-sandbox - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package opennlp.tools.parse_thicket.matching;

 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;


 import opennlp.tools.parse_thicket.IGeneralizer;
 import opennlp.tools.parse_thicket.ParseTreeNode;
 import opennlp.tools.stemmer.PStemmer;
 import opennlp.tools.textsimilarity.GeneralizationListReducer;
 import opennlp.tools.textsimilarity.LemmaFormManager;
 import opennlp.tools.textsimilarity.POSManager;
 import opennlp.tools.textsimilarity.ParseTreeChunk;

 public class NERPhraseGeneralizer extends PhraseGeneralizer {

 	/* alignment is based on NER values, not on POS now
 	 *
 	 */


 	/**
 	 * key matching function which takes two phrases, aligns them and finds a set
 	 * of maximum common sub-phrase
 	 *
 	 * @param chunk1
 	 * @param chunk2
 	 * @return
 	 */
 	@Override
 	public List<ParseTreeChunk> generalize(
 			Object chunk1o, Object chunk2o) {

 		ParseTreeChunk chunk1 = (ParseTreeChunk)chunk1o, chunk2 = (ParseTreeChunk)chunk2o;
 		List<ParseTreeNode> results = new ArrayList<ParseTreeNode>();
 		List<ParseTreeChunk> resultChunks = new ArrayList<ParseTreeChunk>();


 		List<String> pos1 = chunk1.getPOSs();
 		List<String> pos2 = chunk2.getPOSs();
 		List<String> lem1 = chunk1.getLemmas();
 		List<String> lem2 = chunk2.getLemmas();

 		List<String> ner1 = new ArrayList<String>();
 		List<String> ner2 = new ArrayList<String>();


 		for (ParseTreeNode node: chunk1.getParseTreeNodes()) {
 			if (node.getNe()!=null && !node.getNe().equals("O"))
 				ner1.add(node.getNe());
 		}

 		for (ParseTreeNode node: chunk2.getParseTreeNodes()) {
 			if (node.getNe()!=null && !node.getNe().equals("O"))
 				ner2.add(node.getNe());
 		}


 		List<String> overlap = new ArrayList<String>(ner1);
 		overlap.retainAll(ner2);
 		overlap = new ArrayList<String>(new HashSet<String>(overlap));


 		if (overlap == null || overlap.size() < 1)
 			return null;

 		List<Integer> occur1 = new ArrayList<Integer>(), occur2 = new ArrayList<Integer>();
 		for (String word : overlap) {
 			Integer i1 = ner1.indexOf(word);
 			Integer i2 = ner2.indexOf(word);
 			occur1.add(i1);
 			occur2.add(i2);
 		}


 		// for verbs find alignment even if no same verb lemmas, just any pair of verbs. Usually should be 0,0
 		if (chunk1.getMainPOS().startsWith("VP") && chunk2.getMainPOS().startsWith("VP")) {
 			Integer i1 = null, i2 = null;
 			for(int i=0; i< pos1.size(); i++){
 				if (pos1.get(i).startsWith("VB")){
 					i1 = i;
 					break;
 				}
 			}

 			for(int i=0; i< pos2.size(); i++){
 				if (pos2.get(i).startsWith("VB")){
 					i2 = i;
 					break;
 				}
 			}
 			occur1.add(i1);
 			occur2.add(i2);
 		}
 		// now we search for plausible sublists of overlaps
 		// if at some position correspondence is inverse (one of two position
 		// decreases instead of increases)
 		// then we terminate current alignment accum and start a new one
 		List<List<int[]>> overlapsPlaus = new ArrayList<List<int[]>>();
 		// starts from 1, not 0
 		List<int[]> accum = new ArrayList<int[]>();
 		accum.add(new int[] { occur1.get(0), occur2.get(0) });
 		for (int i = 1; i < occur1.size(); i++) {

 			if (occur1.get(i) > occur1.get(i - 1)
 					&& occur2.get(i) > occur2.get(i - 1))
 				accum.add(new int[] { occur1.get(i), occur2.get(i) });
 			else {
 				overlapsPlaus.add(accum);
 				if (occur1!=null && occur2!=null && i<occur1.size() &&  i<occur2.size() ){
 					accum = new ArrayList<int[]>();
 					accum.add(new int[] { occur1.get(i), occur2.get(i) });
 				}
 			}
 		}
 		if (accum.size() > 0) {
 			overlapsPlaus.add(accum);
 		}


 		for (List<int[]> occur : overlapsPlaus) {
 			List<Integer> occr1 = new ArrayList<Integer>(), occr2 = new ArrayList<Integer>();
 			for (int[] column : occur) {
 				occr1.add(column[0]);
 				occr2.add(column[1]);
 			}

 			int ov1 = 0, ov2 = 0; // iterators over common words;
 			List<String> commonPOS = new ArrayList<String>(), commonLemmas = new ArrayList<String>();
 			// we start two words before first word
 			int k1 = occr1.get(ov1) - 2, k2 = occr2.get(ov2) - 2;
 			Boolean bReachedCommonWord = false;
 			while (k1 < 0 || k2 < 0) {
 				k1++;
 				k2++;
 			}
 			int k1max = pos1.size() - 1, k2max = pos2.size() - 1;
 			while (k1 <= k1max && k2 <= k2max) {
 				/*        // first check if the same POS
         String sim = posManager.similarPOS(pos1.get(k1), pos2.get(k2));
         String lemmaMatch = lemmaFormManager.matchLemmas(ps, lem1.get(k1),
             lem2.get(k2), sim);
 				 */
 				String sim = null;
 				List<String> sims = posManager.//similarPOS(pos1.get(k1), pos2.get(k2));
 						generalize(pos1.get(k1), pos2.get(k2));
 				if (!sims.isEmpty())
 					sim = sims.get(0);

 				String lemmaMatch = null;
 				List<String> lemmaMatchs = lemmaFormManager.//matchLemmas(ps,
 						generalize(lem1.get(k1),
 								lem2.get(k2));
 				if (!lemmaMatchs.isEmpty())
 					lemmaMatch = lemmaMatchs.get(0);


 				if ((sim != null)
 						&& (lemmaMatch == null || (lemmaMatch != null && !lemmaMatch
 						.equals("fail")))) {
 					commonPOS.add(pos1.get(k1));


 					// doing parse tree node generalization
 					List<ParseTreeNode> genRes =  nodeGen.generalize(chunk1.getParseTreeNodes().get(k1), chunk2.getParseTreeNodes().get(k2));
 					if (genRes.size()==1)
 						results.add(genRes.get(0));

 					if (lemmaMatch != null) {
 						commonLemmas.add(lemmaMatch);
 						// System.out.println("Added "+lemmaMatch);
 						if (k1 == occr1.get(ov1) && k2 == occr2.get(ov2))
 							bReachedCommonWord = true; // now we can have different increment
 						// opera
 						else {
 							if (occr1.size() > ov1 + 1 && occr2.size() > ov2 + 1
 									&& k1 == occr1.get(ov1 + 1) && k2 == occr2.get(ov2 + 1)) {
 								ov1++;
 								ov2++;
 								bReachedCommonWord = true;
 							}
 							// else
 								// System.err.println("Next match reached '"+lemmaMatch+
 							// "' | k1 - k2: "+k1 + " "+k2 +
 							// "| occur index ov1-ov2 "+
 							// ov1+" "+ov2+
 							// "| identified positions of match: occr1.get(ov1) - occr2.get(ov1) "
 							// +
 							// occr1.get(ov1) + " "+ occr2.get(ov1));
 						}
 					} else {
 						commonLemmas.add("*");
 					} // the same parts of speech, proceed to the next word in both
 					// expressions
 					k1++;
 					k2++;

 				} else if (!bReachedCommonWord) {
 					k1++;
 					k2++;
 				} // still searching
 				else {
 					// different parts of speech, jump to the next identified common word
 					ov1++;
 					ov2++;
 					if (ov1 > occr1.size() - 1 || ov2 > occr2.size() - 1)
 						break;
 					// now trying to find
 					int kk1 = occr1.get(ov1) - 2, // new positions of iterators
 							kk2 = occr2.get(ov2) - 2;
 					int countMove = 0;
 					while ((kk1 < k1 + 1 || kk2 < k2 + 1) && countMove < 2) { // if it is
 						// behind
 						// current
 						// position,
 						// synchroneously
 						// move
 						// towards
 						// right
 						kk1++;
 						kk2++;
 						countMove++;
 					}
 					k1 = kk1;
 					k2 = kk2;

 					if (k1 > k1max)
 						k1 = k1max;
 					if (k2 > k2max)
 						k2 = k2max;
 					bReachedCommonWord = false;
 				}
 			}
 			ParseTreeChunk currResult = new ParseTreeChunk(results),
 					currResultOld = new ParseTreeChunk(commonLemmas, commonPOS, 0, 0);


 			resultChunks.add(currResult);
 		}

 		return resultChunks;
 	}

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package opennlp.tools.parse_thicket.matching;

	import java.util.ArrayList;
	import java.util.HashSet;
	import java.util.List;










	import opennlp.tools.parse_thicket.IGeneralizer;
	import opennlp.tools.parse_thicket.ParseTreeNode;
	import opennlp.tools.stemmer.PStemmer;
	import opennlp.tools.textsimilarity.GeneralizationListReducer;
	import opennlp.tools.textsimilarity.LemmaFormManager;
	import opennlp.tools.textsimilarity.POSManager;
	import opennlp.tools.textsimilarity.ParseTreeChunk;

	public class NERPhraseGeneralizer extends PhraseGeneralizer {

	/* alignment is based on NER values, not on POS now
	*
	*/


	/**
	* key matching function which takes two phrases, aligns them and finds a set
	* of maximum common sub-phrase
	*
	* @param chunk1
	* @param chunk2
	* @return
	*/
	@Override
	public List<ParseTreeChunk> generalize(
	Object chunk1o, Object chunk2o) {

	ParseTreeChunk chunk1 = (ParseTreeChunk)chunk1o, chunk2 = (ParseTreeChunk)chunk2o;
	List<ParseTreeNode> results = new ArrayList<ParseTreeNode>();
	List<ParseTreeChunk> resultChunks = new ArrayList<ParseTreeChunk>();


	List<String> pos1 = chunk1.getPOSs();
	List<String> pos2 = chunk2.getPOSs();
	List<String> lem1 = chunk1.getLemmas();
	List<String> lem2 = chunk2.getLemmas();

	List<String> ner1 = new ArrayList<String>();
	List<String> ner2 = new ArrayList<String>();


	for (ParseTreeNode node: chunk1.getParseTreeNodes()) {
	if (node.getNe()!=null && !node.getNe().equals("O"))
	ner1.add(node.getNe());
	}

	for (ParseTreeNode node: chunk2.getParseTreeNodes()) {
	if (node.getNe()!=null && !node.getNe().equals("O"))
	ner2.add(node.getNe());
	}


	List<String> overlap = new ArrayList<String>(ner1);
	overlap.retainAll(ner2);
	overlap = new ArrayList<String>(new HashSet<String>(overlap));


	if (overlap == null \|\| overlap.size() < 1)
	return null;

	List<Integer> occur1 = new ArrayList<Integer>(), occur2 = new ArrayList<Integer>();
	for (String word : overlap) {
	Integer i1 = ner1.indexOf(word);
	Integer i2 = ner2.indexOf(word);
	occur1.add(i1);
	occur2.add(i2);
	}


	// for verbs find alignment even if no same verb lemmas, just any pair of verbs. Usually should be 0,0
	if (chunk1.getMainPOS().startsWith("VP") && chunk2.getMainPOS().startsWith("VP")) {
	Integer i1 = null, i2 = null;
	for(int i=0; i< pos1.size(); i++){
	if (pos1.get(i).startsWith("VB")){
	i1 = i;
	break;
	}
	}

	for(int i=0; i< pos2.size(); i++){
	if (pos2.get(i).startsWith("VB")){
	i2 = i;
	break;
	}
	}
	occur1.add(i1);
	occur2.add(i2);
	}
	// now we search for plausible sublists of overlaps
	// if at some position correspondence is inverse (one of two position
	// decreases instead of increases)
	// then we terminate current alignment accum and start a new one
	List<List<int[]>> overlapsPlaus = new ArrayList<List<int[]>>();
	// starts from 1, not 0
	List<int[]> accum = new ArrayList<int[]>();
	accum.add(new int[] { occur1.get(0), occur2.get(0) });
	for (int i = 1; i < occur1.size(); i++) {

	if (occur1.get(i) > occur1.get(i - 1)
	&& occur2.get(i) > occur2.get(i - 1))
	accum.add(new int[] { occur1.get(i), occur2.get(i) });
	else {
	overlapsPlaus.add(accum);
	if (occur1!=null && occur2!=null && i<occur1.size() && i<occur2.size() ){
	accum = new ArrayList<int[]>();
	accum.add(new int[] { occur1.get(i), occur2.get(i) });
	}
	}
	}
	if (accum.size() > 0) {
	overlapsPlaus.add(accum);
	}


	for (List<int[]> occur : overlapsPlaus) {
	List<Integer> occr1 = new ArrayList<Integer>(), occr2 = new ArrayList<Integer>();
	for (int[] column : occur) {
	occr1.add(column[0]);
	occr2.add(column[1]);
	}

	int ov1 = 0, ov2 = 0; // iterators over common words;
	List<String> commonPOS = new ArrayList<String>(), commonLemmas = new ArrayList<String>();
	// we start two words before first word
	int k1 = occr1.get(ov1) - 2, k2 = occr2.get(ov2) - 2;
	Boolean bReachedCommonWord = false;
	while (k1 < 0 \|\| k2 < 0) {
	k1++;
	k2++;
	}
	int k1max = pos1.size() - 1, k2max = pos2.size() - 1;
	while (k1 <= k1max && k2 <= k2max) {
	/* // first check if the same POS
	String sim = posManager.similarPOS(pos1.get(k1), pos2.get(k2));
	String lemmaMatch = lemmaFormManager.matchLemmas(ps, lem1.get(k1),
	lem2.get(k2), sim);
	*/
	String sim = null;
	List<String> sims = posManager.//similarPOS(pos1.get(k1), pos2.get(k2));
	generalize(pos1.get(k1), pos2.get(k2));
	if (!sims.isEmpty())
	sim = sims.get(0);

	String lemmaMatch = null;
	List<String> lemmaMatchs = lemmaFormManager.//matchLemmas(ps,
	generalize(lem1.get(k1),
	lem2.get(k2));
	if (!lemmaMatchs.isEmpty())
	lemmaMatch = lemmaMatchs.get(0);



	if ((sim != null)
	&& (lemmaMatch == null \|\| (lemmaMatch != null && !lemmaMatch
	.equals("fail")))) {
	commonPOS.add(pos1.get(k1));


	// doing parse tree node generalization
	List<ParseTreeNode> genRes = nodeGen.generalize(chunk1.getParseTreeNodes().get(k1), chunk2.getParseTreeNodes().get(k2));
	if (genRes.size()==1)
	results.add(genRes.get(0));

	if (lemmaMatch != null) {
	commonLemmas.add(lemmaMatch);
	// System.out.println("Added "+lemmaMatch);
	if (k1 == occr1.get(ov1) && k2 == occr2.get(ov2))
	bReachedCommonWord = true; // now we can have different increment
	// opera
	else {
	if (occr1.size() > ov1 + 1 && occr2.size() > ov2 + 1
	&& k1 == occr1.get(ov1 + 1) && k2 == occr2.get(ov2 + 1)) {
	ov1++;
	ov2++;
	bReachedCommonWord = true;
	}
	// else
	// System.err.println("Next match reached '"+lemmaMatch+
	// "' \| k1 - k2: "+k1 + " "+k2 +
	// "\| occur index ov1-ov2 "+
	// ov1+" "+ov2+
	// "\| identified positions of match: occr1.get(ov1) - occr2.get(ov1) "
	// +
	// occr1.get(ov1) + " "+ occr2.get(ov1));
	}
	} else {
	commonLemmas.add("*");
	} // the same parts of speech, proceed to the next word in both
	// expressions
	k1++;
	k2++;

	} else if (!bReachedCommonWord) {
	k1++;
	k2++;
	} // still searching
	else {
	// different parts of speech, jump to the next identified common word
	ov1++;
	ov2++;
	if (ov1 > occr1.size() - 1 \|\| ov2 > occr2.size() - 1)
	break;
	// now trying to find
	int kk1 = occr1.get(ov1) - 2, // new positions of iterators
	kk2 = occr2.get(ov2) - 2;
	int countMove = 0;
	while ((kk1 < k1 + 1 \|\| kk2 < k2 + 1) && countMove < 2) { // if it is
	// behind
	// current
	// position,
	// synchroneously
	// move
	// towards
	// right
	kk1++;
	kk2++;
	countMove++;
	}
	k1 = kk1;
	k2 = kk2;

	if (k1 > k1max)
	k1 = k1max;
	if (k2 > k2max)
	k2 = k2max;
	bReachedCommonWord = false;
	}
	}
	ParseTreeChunk currResult = new ParseTreeChunk(results),
	currResultOld = new ParseTreeChunk(commonLemmas, commonPOS, 0, 0);


	resultChunks.add(currResult);
	}

	return resultChunks;
	}

	}