blob: eef19ae8d652ba4a0cabc9b37d3d1c22aa17efee [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.similarity.apps.taxo_builder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import opennlp.tools.similarity.apps.utils.FileHandler;
import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
import com.thoughtworks.xstream.XStream;
/**
 * Generates a score based on the overlap between a text and a given taxonomy.
 *
 * <p>The taxonomy is read through {@link TaxonomySerializer}; the query and the snapshot are
 * tokenized with the tokenizer exposed by {@link ParserChunker2MatcherProcessor}.
 */
public class TaxoQuerySnapshotMatcher {

  /** Taxonomy file loaded by the no-argument constructor. */
  private static final String DEFAULT_TAXONOMY_PATH = "src/test/resources/taxonomies/irs_domTaxo.dat";

  private static final Logger LOG = Logger.getLogger("opennlp.tools.similarity.apps.taxo_builder.TaxoQuerySnapshotMatcher");

  ParserChunker2MatcherProcessor sm;
  /** Refreshed from {@link #taxo} at the start of every {@link #getTaxoScore} call. */
  Map<String, List<List<String>>> lemma_ExtendedAssocWords;
  TaxonomySerializer taxo;

  /**
   * Creates a matcher backed by the default taxonomy file
   * ({@code src/test/resources/taxonomies/irs_domTaxo.dat}).
   */
  public TaxoQuerySnapshotMatcher() {
    this(DEFAULT_TAXONOMY_PATH);
  }

  /**
   * Creates a matcher backed by the serialized taxonomy found at the given location.
   *
   * @param taxonomyPath path handed to {@link TaxonomySerializer#readTaxonomy}
   */
  public TaxoQuerySnapshotMatcher(String taxonomyPath) {
    sm = ParserChunker2MatcherProcessor.getInstance();
    taxo = TaxonomySerializer.readTaxonomy(taxonomyPath);
  }

  /**
   * Generates a score based on the overlap between a query, a snapshot and the taxonomy.
   *
   * <p>Both texts are lower-cased and tokenized. For every token that occurs in both texts and
   * is a key of the taxonomy, each associated word list is reduced to the words that also occur
   * in both texts; the number of such words is added to the score unless all of them have
   * already been counted for an earlier list.
   *
   * <p>The taxonomy itself is never modified by this method, so repeated calls with the same
   * arguments return the same score.
   *
   * @param query The query string the user used to ask a question.
   * @param snapshot The abstract of a hit the system gave back.
   * @return the accumulated number of shared taxonomy words; 0 if either argument is
   *         {@code null} or nothing overlaps
   */
  public int getTaxoScore(String query, String snapshot) {
    // Cast to the interface (not HashMap) so any Map implementation held by the taxonomy works.
    lemma_ExtendedAssocWords = (Map<String, List<List<String>>>) taxo.getLemma_ExtendedAssocWords();
    if (query == null || snapshot == null) {
      return 0;
    }

    String[] queryWords = sm.getTokenizer().tokenize(query.toLowerCase());
    String[] snapshotWords = sm.getTokenizer().tokenize(snapshot.toLowerCase());
    List<String> queryList = Arrays.asList(queryWords);
    List<String> snapshotList = Arrays.asList(snapshotWords);

    // Tokens of the query that also occur in the snapshot. Repeated query tokens stay repeated
    // here; the accumCommonParams check below keeps them from being scored twice.
    List<String> commonBetweenQuerySnapshot = new ArrayList<String>(queryList);
    commonBetweenQuerySnapshot.retainAll(snapshotList);

    int score = 0;
    List<String> accumCommonParams = new ArrayList<String>();
    for (String qWord : commonBetweenQuerySnapshot) {
      List<List<String>> foundParams = lemma_ExtendedAssocWords.get(qWord);
      if (foundParams == null) {
        continue;
      }
      for (List<String> paramsForGivenMeaning : foundParams) {
        // Intersect on a copy: retainAll mutates its receiver, and these lists belong to the
        // shared taxonomy. Filtering them in place would corrupt every later call.
        List<String> sharedParams = new ArrayList<String>(paramsForGivenMeaning);
        sharedParams.retainAll(queryList);
        sharedParams.retainAll(snapshotList);
        int size = sharedParams.size();
        if (size > 0 && !accumCommonParams.containsAll(sharedParams)) {
          score += size;
          accumCommonParams.addAll(sharedParams);
        }
      }
    }
    return score;
  }

  /**
   * Serializes the given taxonomy to XML with XStream and writes the result to a text file.
   * A failure is logged and not propagated to the caller.
   *
   * @param taxonomyXML_Path path of the XML file to write
   * @param taxo the taxonomy to serialize
   */
  public void convertDatToXML(String taxonomyXML_Path, TaxonomySerializer taxo) {
    XStream xStream = new XStream();
    FileHandler fileHandler = new FileHandler();
    try {
      fileHandler.writeToTextFile(xStream.toXML(taxo), taxonomyXML_Path, false);
    } catch (Exception e) {
      // Pass the exception itself so the stack trace reaches the log handlers.
      LOG.log(Level.WARNING, "Could not write taxonomy XML to " + taxonomyXML_Path, e);
    }
  }

  /**
   * Reads {@code src/test/resources/taxo_English.xml} and deserializes it with XStream into the
   * taxonomy of a freshly created matcher.
   *
   * <p>NOTE(review): the matcher created here is local and discarded on return, so this method
   * has no effect on the instance it is called on. If the intent was to replace this instance's
   * taxonomy, assign to {@code this.taxo} instead; confirm with the callers before changing.
   */
  public void xmlWork() {
    TaxoQuerySnapshotMatcher matcher = new TaxoQuerySnapshotMatcher();
    XStream xStream = new XStream();
    FileHandler fileHandler = new FileHandler();
    matcher.taxo = (TaxonomySerializer) xStream.fromXML(fileHandler.readFromTextFile("src/test/resources/taxo_English.xml"));
  }

  /**
   * Demonstrates the usage of the taxonomy matcher by printing the score of a sample
   * query/snapshot pair.
   *
   * @param args not used
   */
  public static void main(String[] args) {
    TaxoQuerySnapshotMatcher matcher = new TaxoQuerySnapshotMatcher();
    System.out.println("The score is: " + matcher.getTaxoScore("Can Form 1040 EZ be used to claim the earned income credit.",
        "Can Form 1040EZ be used to claim the earned income credit? . Must I be entitled to claim a child as a dependent to claim the earned income credit based on the child being "));
  }
}