/* | |
* Licensed to the Apache Software Foundation (ASF) under one or more | |
* contributor license agreements. See the NOTICE file distributed with | |
* this work for additional information regarding copyright ownership. | |
* The ASF licenses this file to You under the Apache License, Version 2.0 | |
* (the "License"); you may not use this file except in compliance with | |
* the License. You may obtain a copy of the License at | |
* | |
* http://www.apache.org/licenses/LICENSE-2.0 | |
* | |
* Unless required by applicable law or agreed to in writing, software | |
* distributed under the License is distributed on an "AS IS" BASIS, | |
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
* See the License for the specific language governing permissions and | |
* limitations under the License. | |
*/ | |
package opennlp.tools.similarity.apps.taxo_builder; | |
import java.util.ArrayList; | |
import java.util.Arrays; | |
import java.util.HashMap; | |
import java.util.Hashtable; | |
import java.util.List; | |
import java.util.Map; | |
import java.util.logging.Logger; | |
import opennlp.tools.similarity.apps.utils.FileHandler; | |
import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; | |
import com.thoughtworks.xstream.XStream; | |
/** | |
* This class can be used to generate scores based on the overlapping between a | |
* text and a given taxonomy. | |
* | |
*/ | |
public class TaxoQuerySnapshotMatcher { | |
ParserChunker2MatcherProcessor sm; | |
// XStream xStream= new XStream(); | |
Map<String, List<List<String>>> lemma_ExtendedAssocWords; | |
TaxonomySerializer taxo; | |
private static Logger LOG = Logger | |
.getLogger("opennlp.tools.similarity.apps.taxo_builder.TaxoQuerySnapshotMatcher"); | |
public TaxoQuerySnapshotMatcher(String taxoFileName) { | |
sm = ParserChunker2MatcherProcessor.getInstance(); | |
taxo = TaxonomySerializer.readTaxonomy(taxoFileName); // "src/test/resources/taxonomies/irs_domTaxo.dat"); | |
} | |
/** | |
* Can be used to generate scores based on the overlapping between a text and | |
* a given taxonomy. | |
* | |
* @param query | |
* The query string the user used for ask a question. | |
* @param snapshot | |
* The abstract of a hit the system gave back | |
* @return | |
*/ | |
public int getTaxoScore(String query, String snapshot) { | |
lemma_ExtendedAssocWords = (HashMap<String, List<List<String>>>) taxo | |
.getLemma_ExtendedAssocWords(); | |
query = query.toLowerCase(); | |
snapshot = snapshot.toLowerCase(); | |
String[] queryWords = sm.getTokenizer().tokenize(query); | |
String[] snapshotWords = sm.getTokenizer().tokenize(snapshot); | |
List<String> queryList = Arrays.asList(queryWords); | |
List<String> snapshotList = Arrays.asList(snapshotWords); | |
List<String> commonBetweenQuerySnapshot = (new ArrayList<String>(queryList)); | |
commonBetweenQuerySnapshot.retainAll(snapshotList);// Still could be | |
// duplicated words (even | |
// more if I would retain | |
// all the opposite ways) | |
int score = 0; | |
List<String> accumCommonParams = new ArrayList<String>(); | |
for (String qWord : commonBetweenQuerySnapshot) { | |
if (!lemma_ExtendedAssocWords.containsKey(qWord)) | |
continue; | |
List<List<String>> foundParams = new ArrayList<List<String>>(); | |
foundParams = lemma_ExtendedAssocWords.get(qWord); | |
for (List<String> paramsForGivenMeaning : foundParams) { | |
paramsForGivenMeaning.retainAll(queryList); | |
paramsForGivenMeaning.retainAll(snapshotList); | |
int size = paramsForGivenMeaning.size(); | |
if (size > 0 && !accumCommonParams.containsAll(paramsForGivenMeaning)) { | |
score += size; | |
accumCommonParams.addAll(paramsForGivenMeaning); | |
} | |
} | |
} | |
return score; | |
} | |
/** | |
* It loads a serialized taxonomy in .dat format and serializes it into a much | |
* more readable XML format. | |
* | |
* @param taxonomyPath | |
* @param taxonomyXML_Path | |
* */ | |
public void convertDatToXML(String taxonomyXML_Path, TaxonomySerializer taxo) { | |
XStream xStream = new XStream(); | |
FileHandler fileHandler = new FileHandler(); | |
try { | |
fileHandler.writeToTextFile(xStream.toXML(taxo), taxonomyXML_Path, false); | |
} catch (Exception e) { | |
e.printStackTrace(); | |
LOG.info(e.toString()); | |
} | |
} | |
public void xmlWork (){ | |
TaxoQuerySnapshotMatcher matcher = new TaxoQuerySnapshotMatcher("src/test/resources/taxonomies/irs_domTaxo.dat"); | |
XStream xStream = new XStream(); | |
FileHandler fileHandler = new FileHandler(); | |
matcher.taxo = (TaxonomySerializer)xStream.fromXML(fileHandler.readFromTextFile("src/test/resources/taxo_English.xml")); | |
} | |
public void close() { | |
sm.close(); | |
} | |
/** | |
* demonstrates the usage of the taxonomy matcher | |
* | |
* @param args | |
*/ | |
static public void main(String[] args) { | |
TaxoQuerySnapshotMatcher matcher = new TaxoQuerySnapshotMatcher( | |
"src/test/resources/taxonomies/irs_domTaxo.dat"); | |
System.out | |
.println("The score is: " | |
+ matcher | |
.getTaxoScore( | |
"Can Form 1040 EZ be used to claim the earned income credit.", | |
"Can Form 1040EZ be used to claim the earned income credit? . Must I be entitled to claim a child as a dependent to claim the earned income credit based on the child being ")); | |
} | |
} |