opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxoQuerySnapshotMatcher.java - opennlp-sandbox - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package opennlp.tools.similarity.apps.taxo_builder;

 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Hashtable;
 import java.util.List;
 import java.util.Map;
 import java.util.logging.Logger;

 import opennlp.tools.similarity.apps.utils.FileHandler;
 import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

 import com.thoughtworks.xstream.XStream;


 /**
  * Scores how much a query and a result snapshot overlap with respect to a taxonomy.
  *
  * <p>The taxonomy is read once in the constructor; {@link #getTaxoScore(String, String)}
  * looks up the words shared by the query and the snapshot in the taxonomy's
  * lemma-to-associated-words map and counts the associated words that also occur in
  * both texts.
  */
 public class TaxoQuerySnapshotMatcher {

     /** Processor whose tokenizer is used to split the query and the snapshot into words. */
     ParserChunker2MatcherProcessor sm;

     /** Lemma-to-associated-words map; refreshed from {@link #taxo} on every scoring call. */
     Map<String, List<List<String>>> lemma_ExtendedAssocWords;

     /** The taxonomy the scores are computed against. */
     TaxonomySerializer taxo;

     private static final Logger LOG = Logger.getLogger("opennlp.tools.similarity.apps.taxo_builder.TaxoQuerySnapshotMatcher");

     /**
      * Creates a matcher using the processor returned by
      * {@code ParserChunker2MatcherProcessor.getInstance()} and the taxonomy read from the
      * hard-coded path {@code src/test/resources/taxonomies/irs_domTaxo.dat}.
      */
     public TaxoQuerySnapshotMatcher() {
         sm = ParserChunker2MatcherProcessor.getInstance();
         taxo = TaxonomySerializer.readTaxonomy("src/test/resources/taxonomies/irs_domTaxo.dat");
     }

     /**
      * Generates a score based on the overlap between a query, a snapshot and the taxonomy.
      *
      * <p>Both texts are lower-cased and tokenized. For every word they share that is a key
      * of the taxonomy map, each associated word list is reduced to the words that occur in
      * both texts; the size of that reduced list is added to the score unless all of its
      * words were already counted.
      *
      * <p>The taxonomy's own lists are never modified: filtering happens on a copy, so
      * repeated calls on the same matcher see the same taxonomy.
      *
      * @param query The query string the user used to ask a question.
      * @param snapshot The abstract of a hit the system gave back.
      * @return the overlap score; 0 when nothing matches or when either argument is null
      */
     public int getTaxoScore(String query, String snapshot) {
         // Cast to the interface rather than HashMap so any Map implementation is accepted.
         lemma_ExtendedAssocWords = (Map<String, List<List<String>>>) taxo.getLemma_ExtendedAssocWords();

         if (query == null || snapshot == null) {
             return 0;
         }

         String[] queryWords = sm.getTokenizer().tokenize(query.toLowerCase());
         String[] snapshotWords = sm.getTokenizer().tokenize(snapshot.toLowerCase());

         // Words of the query that also occur in the snapshot (duplicates of the query are kept).
         List<String> commonWords = new ArrayList<String>(Arrays.asList(queryWords));
         commonWords.retainAll(Arrays.asList(snapshotWords));

         int score = 0;
         List<String> accumCommonParams = new ArrayList<String>();
         for (String word : commonWords) {
             List<List<String>> meanings = lemma_ExtendedAssocWords.get(word);
             if (meanings == null) {
                 continue; // word is not a taxonomy entry
             }
             for (List<String> paramsForGivenMeaning : meanings) {
                 if (paramsForGivenMeaning == null) {
                     continue;
                 }
                 // Work on a copy: retainAll on the taxonomy's list would permanently
                 // strip entries from the taxonomy and skew every later call.
                 List<String> matched = new ArrayList<String>(paramsForGivenMeaning);
                 // A word is in commonWords exactly when it is in both the query and the
                 // snapshot, so one retainAll replaces the two separate intersections.
                 matched.retainAll(commonWords);

                 if (!matched.isEmpty() && !accumCommonParams.containsAll(matched)) {
                     score += matched.size();
                     accumCommonParams.addAll(matched);
                 }
             }
         }
         return score;
     }

     /**
      * Serializes the given taxonomy to XML with XStream and writes it to a text file.
      * Failures are logged and not propagated to the caller.
      *
      * @param taxonomyXML_Path path of the XML file to write
      * @param taxo the taxonomy to serialize
      */
     public void convertDatToXML(String taxonomyXML_Path, TaxonomySerializer taxo) {
         XStream xStream = new XStream();
         FileHandler fileHandler = new FileHandler();
         try {
             // NOTE(review): the trailing 'false' is presumably an append flag - confirm in FileHandler.
             fileHandler.writeToTextFile(xStream.toXML(taxo), taxonomyXML_Path, false);
         } catch (Exception e) {
             // Log as an error and keep the cause instead of printing the stack trace to stderr.
             LOG.log(java.util.logging.Level.SEVERE, "Failed to write taxonomy XML to " + taxonomyXML_Path, e);
         }
     }

     /**
      * Reads {@code src/test/resources/taxo_English.xml} and deserializes it with XStream
      * into the taxonomy of a newly created matcher.
      */
     public void xmlWork() {
         // NOTE(review): the taxonomy is assigned to a local matcher that is discarded when
         // this method returns, so this instance is left unchanged - confirm whether
         // 'this.taxo' was intended.
         TaxoQuerySnapshotMatcher matcher = new TaxoQuerySnapshotMatcher();
         XStream xStream = new XStream();
         FileHandler fileHandler = new FileHandler();
         matcher.taxo = (TaxonomySerializer) xStream.fromXML(fileHandler.readFromTextFile("src/test/resources/taxo_English.xml"));
     }

     /**
      * Demonstrates the usage of the taxonomy matcher by printing the score of a sample
      * query/snapshot pair.
      *
      * @param args unused
      */
     public static void main(String[] args) {
         TaxoQuerySnapshotMatcher matcher = new TaxoQuerySnapshotMatcher();

         System.out.println("The score is: " + matcher.getTaxoScore("Can Form 1040 EZ be used to claim the earned income credit.",
                 "Can Form 1040EZ be used to claim the earned income credit? . Must I be entitled to claim a child as a dependent to claim the earned income credit based on the child being "));
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package opennlp.tools.similarity.apps.taxo_builder;

	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.HashMap;
	import java.util.Hashtable;
	import java.util.List;
	import java.util.Map;
	import java.util.logging.Logger;

	import opennlp.tools.similarity.apps.utils.FileHandler;
	import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

	import com.thoughtworks.xstream.XStream;


	/**
	 * Scores how much a query and a result snapshot overlap with respect to a taxonomy.
	 *
	 * <p>The taxonomy is read once in the constructor; {@link #getTaxoScore(String, String)}
	 * looks up the words shared by the query and the snapshot in the taxonomy's
	 * lemma-to-associated-words map and counts the associated words that also occur in
	 * both texts.
	 */
	public class TaxoQuerySnapshotMatcher {

		/** Processor whose tokenizer is used to split the query and the snapshot into words. */
		ParserChunker2MatcherProcessor sm;

		/** Lemma-to-associated-words map; refreshed from {@link #taxo} on every scoring call. */
		Map<String, List<List<String>>> lemma_ExtendedAssocWords;

		/** The taxonomy the scores are computed against. */
		TaxonomySerializer taxo;

		private static final Logger LOG = Logger.getLogger("opennlp.tools.similarity.apps.taxo_builder.TaxoQuerySnapshotMatcher");

		/**
		 * Creates a matcher using the processor returned by
		 * {@code ParserChunker2MatcherProcessor.getInstance()} and the taxonomy read from the
		 * hard-coded path {@code src/test/resources/taxonomies/irs_domTaxo.dat}.
		 */
		public TaxoQuerySnapshotMatcher() {
			sm = ParserChunker2MatcherProcessor.getInstance();
			taxo = TaxonomySerializer.readTaxonomy("src/test/resources/taxonomies/irs_domTaxo.dat");
		}

		/**
		 * Generates a score based on the overlap between a query, a snapshot and the taxonomy.
		 *
		 * <p>Both texts are lower-cased and tokenized. For every word they share that is a key
		 * of the taxonomy map, each associated word list is reduced to the words that occur in
		 * both texts; the size of that reduced list is added to the score unless all of its
		 * words were already counted.
		 *
		 * <p>The taxonomy's own lists are never modified: filtering happens on a copy, so
		 * repeated calls on the same matcher see the same taxonomy.
		 *
		 * @param query The query string the user used to ask a question.
		 * @param snapshot The abstract of a hit the system gave back.
		 * @return the overlap score; 0 when nothing matches or when either argument is null
		 */
		public int getTaxoScore(String query, String snapshot) {
			// Cast to the interface rather than HashMap so any Map implementation is accepted.
			lemma_ExtendedAssocWords = (Map<String, List<List<String>>>) taxo.getLemma_ExtendedAssocWords();

			if (query == null || snapshot == null) {
				return 0;
			}

			String[] queryWords = sm.getTokenizer().tokenize(query.toLowerCase());
			String[] snapshotWords = sm.getTokenizer().tokenize(snapshot.toLowerCase());

			// Words of the query that also occur in the snapshot (duplicates of the query are kept).
			List<String> commonWords = new ArrayList<String>(Arrays.asList(queryWords));
			commonWords.retainAll(Arrays.asList(snapshotWords));

			int score = 0;
			List<String> accumCommonParams = new ArrayList<String>();
			for (String word : commonWords) {
				List<List<String>> meanings = lemma_ExtendedAssocWords.get(word);
				if (meanings == null) {
					continue; // word is not a taxonomy entry
				}
				for (List<String> paramsForGivenMeaning : meanings) {
					if (paramsForGivenMeaning == null) {
						continue;
					}
					// Work on a copy: retainAll on the taxonomy's list would permanently
					// strip entries from the taxonomy and skew every later call.
					List<String> matched = new ArrayList<String>(paramsForGivenMeaning);
					// A word is in commonWords exactly when it is in both the query and the
					// snapshot, so one retainAll replaces the two separate intersections.
					matched.retainAll(commonWords);

					if (!matched.isEmpty() && !accumCommonParams.containsAll(matched)) {
						score += matched.size();
						accumCommonParams.addAll(matched);
					}
				}
			}
			return score;
		}

		/**
		 * Serializes the given taxonomy to XML with XStream and writes it to a text file.
		 * Failures are logged and not propagated to the caller.
		 *
		 * @param taxonomyXML_Path path of the XML file to write
		 * @param taxo the taxonomy to serialize
		 */
		public void convertDatToXML(String taxonomyXML_Path, TaxonomySerializer taxo) {
			XStream xStream = new XStream();
			FileHandler fileHandler = new FileHandler();
			try {
				// NOTE(review): the trailing 'false' is presumably an append flag - confirm in FileHandler.
				fileHandler.writeToTextFile(xStream.toXML(taxo), taxonomyXML_Path, false);
			} catch (Exception e) {
				// Log as an error and keep the cause instead of printing the stack trace to stderr.
				LOG.log(java.util.logging.Level.SEVERE, "Failed to write taxonomy XML to " + taxonomyXML_Path, e);
			}
		}

		/**
		 * Reads {@code src/test/resources/taxo_English.xml} and deserializes it with XStream
		 * into the taxonomy of a newly created matcher.
		 */
		public void xmlWork() {
			// NOTE(review): the taxonomy is assigned to a local matcher that is discarded when
			// this method returns, so this instance is left unchanged - confirm whether
			// 'this.taxo' was intended.
			TaxoQuerySnapshotMatcher matcher = new TaxoQuerySnapshotMatcher();
			XStream xStream = new XStream();
			FileHandler fileHandler = new FileHandler();
			matcher.taxo = (TaxonomySerializer) xStream.fromXML(fileHandler.readFromTextFile("src/test/resources/taxo_English.xml"));
		}

		/**
		 * Demonstrates the usage of the taxonomy matcher by printing the score of a sample
		 * query/snapshot pair.
		 *
		 * @param args unused
		 */
		public static void main(String[] args) {
			TaxoQuerySnapshotMatcher matcher = new TaxoQuerySnapshotMatcher();

			System.out.println("The score is: " + matcher.getTaxoScore("Can Form 1040 EZ be used to claim the earned income credit.",
					"Can Form 1040EZ be used to claim the earned income credit? . Must I be entitled to claim a child as a dependent to claim the earned income credit based on the child being "));
		}
	}