/* | |
* Licensed to the Apache Software Foundation (ASF) under one or more | |
* contributor license agreements. See the NOTICE file distributed with | |
* this work for additional information regarding copyright ownership. | |
* The ASF licenses this file to You under the Apache License, Version 2.0 | |
* (the "License"); you may not use this file except in compliance with | |
* the License. You may obtain a copy of the License at | |
* | |
* http://www.apache.org/licenses/LICENSE-2.0 | |
* | |
* Unless required by applicable law or agreed to in writing, software | |
* distributed under the License is distributed on an "AS IS" BASIS, | |
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
* See the License for the specific language governing permissions and | |
* limitations under the License. | |
*/ | |
package opennlp.tools.parse_thicket.kernel_interface; | |
import java.util.ArrayList; | |
import java.util.Collections; | |
import java.util.List; | |
import java.util.logging.Logger; | |
import edu.stanford.nlp.trees.Tree; | |
import opennlp.tools.jsmlearning.ProfileReaderWriter; | |
import opennlp.tools.parse_thicket.ParseThicket; | |
import opennlp.tools.parse_thicket.apps.BingQueryRunnerMultipageSearchResults; | |
import opennlp.tools.parse_thicket.apps.MultiSentenceSearchResultsProcessor; | |
import opennlp.tools.parse_thicket.apps.SnippetToParagraph; | |
import opennlp.tools.parse_thicket.matching.Matcher; | |
import opennlp.tools.similarity.apps.BingQueryRunner; | |
import opennlp.tools.similarity.apps.HitBase; | |
import opennlp.tools.similarity.apps.HitBaseComparable; | |
import opennlp.tools.similarity.apps.WebSearchEngineResultsScraper; | |
import opennlp.tools.textsimilarity.ParseTreeChunk; | |
import opennlp.tools.textsimilarity.ParseTreeChunkListScorer; | |
import opennlp.tools.textsimilarity.SentencePairMatchResult; | |
import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; | |
public class MultiSentenceKernelBasedSearchResultsProcessor extends MultiSentenceSearchResultsProcessor{ | |
private static Logger LOG = Logger | |
.getLogger("opennlp.tools.similarity.apps.MultiSentenceKernelBasedSearchResultsProcessor"); | |
private WebSearchEngineResultsScraper scraper = new WebSearchEngineResultsScraper(); | |
protected Matcher matcher = new Matcher(); | |
private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer(); | |
protected BingQueryRunnerMultipageSearchResults bingSearcher = new BingQueryRunnerMultipageSearchResults(); | |
private SnippetToParagraph snp = new SnippetToParagraph(); | |
private TreeKernelRunner tkRunner = new TreeKernelRunner(); | |
private String path; | |
public void setKernelPath (String path){ | |
this.path=path; | |
} | |
protected static final String modelFileName = "model.txt"; | |
private static final String trainingFileName = "training.txt"; | |
protected static final String unknownToBeClassified = "unknown.txt"; | |
private static final String classifierOutput = "classifier_output.txt"; | |
public List<HitBase> runSearchViaAPI(String query) { | |
List<HitBase> hits = null; | |
try { | |
List<HitBase> resultList = bingSearcher.runSearch(query); | |
// now we apply our own relevance filter | |
//hits = calculateMatchScoreResortHits(resultList, query); | |
hits = resultList; | |
//once we applied our re-ranking, we set highly ranked as positive set, low-rated as negative set | |
//and classify all these search results again | |
//training set is formed from original documents for the search results, | |
// and snippets of these search results are classified | |
hits = filterOutIrrelevantHitsByTreeKernelLearning(hits, query); | |
} catch (Exception e) { | |
e.printStackTrace(); | |
LOG.info("No search results for query '" + query); | |
return null; | |
} | |
return hits; | |
} | |
private List<HitBase> filterOutIrrelevantHitsByTreeKernelLearning( | |
List<HitBase> hits, String query) { | |
List<HitBase> newHitList = new ArrayList<HitBase>(), newHitListReRanked = new ArrayList<HitBase>(); | |
// form the training set from original documents. Since search results are ranked, we set the first half as positive set, | |
//and the second half as negative set. | |
// after re-classification, being re-ranked, the search results might end up in a different set | |
List<String[]> treeBankBuffer = new ArrayList<String[]>(); | |
int count = 0; | |
for (HitBase hit : hits) { | |
count++; | |
// if orig content has been already set in HIT object, ok; otherwise set it | |
String searchResultText = hit.getPageContent(); | |
if (searchResultText ==null){ | |
String[] pageSentsAndSnippet = formTextForReRankingFromHit(hit); | |
searchResultText = pageSentsAndSnippet[0]; | |
hit.setPageContent(searchResultText); | |
} | |
newHitList.add(hit); | |
treeBankBuffer.addAll(formTreeKernelStructure(searchResultText, count, hits)); | |
} | |
// write the lits of samples to a file | |
ProfileReaderWriter.writeReport(treeBankBuffer, path+trainingFileName, ' '); | |
// build the model | |
tkRunner.runLearner(path, trainingFileName, modelFileName); | |
// now we preparing the same answers to be classifies in/out | |
treeBankBuffer = new ArrayList<String[]>(); | |
for (HitBase hit : newHitList) { | |
// not original docs now but instead a snippet | |
String searchResultTextAbstr = hit.getAbstractText(); | |
String snippet = searchResultTextAbstr.replace("<b>...</b>", ". ").replace("<span class='best-phrase'>", " ").replace("<span>", " ").replace("<span>", " ") | |
.replace("<b>", "").replace("</b>", ""); | |
snippet = snippet.replace("</B>", "").replace("<B>", "") | |
.replace("<br>", "").replace("</br>", "").replace("...", ". ") | |
.replace("|", " ").replace(">", " ").replace(". .", ". "); | |
snippet = hit.getTitle() + " " + snippet; | |
ParseThicket pt = matcher.buildParseThicketFromTextWithRST(snippet); | |
//hit.getPageContent()); | |
List<Tree> forest = pt.getSentences(); | |
// we consider the snippet as a single sentence to be classified | |
if (forest.size()>0){ | |
treeBankBuffer.add(new String[] {"0 |BT| "+forest.get(0).toString()+ " |ET|"}); | |
newHitListReRanked .add(hit); | |
} | |
} | |
// form a file from the snippets to be classified | |
ProfileReaderWriter.writeReport(treeBankBuffer, path+unknownToBeClassified, ' '); | |
tkRunner.runClassifier(path, unknownToBeClassified, modelFileName, classifierOutput); | |
// read classification results | |
List<String[]> classifResults = ProfileReaderWriter.readProfiles(path+classifierOutput, ' '); | |
// iterate through classification results and set them as scores for hits | |
newHitList = new ArrayList<HitBase>(); | |
for(int i=0; i<newHitListReRanked.size() && i<classifResults.size() ; i++){ | |
String scoreClassif = classifResults.get(i)[0]; | |
float val = Float.parseFloat(scoreClassif); | |
HitBase hit = newHitListReRanked.get(i); | |
hit.setGenerWithQueryScore((double) val); | |
newHitList.add(hit); | |
} | |
// sort by SVM classification results | |
Collections.sort(newHitList, new HitBaseComparable()); | |
System.out.println("\n\n ============= NEW ORDER ================= "); | |
for (HitBase hit : newHitList) { | |
System.out.println(hit.getOriginalSentences().toString() + " => "+hit.getGenerWithQueryScore()); | |
System.out.println("page content = "+hit.getPageContent()); | |
System.out.println("title = "+hit.getAbstractText()); | |
System.out.println("snippet = "+hit.getAbstractText()); | |
System.out.println("match = "+hit.getSource()); | |
} | |
return newHitList; | |
} | |
protected List<String[]> formTreeKernelStructure(String searchResultText, int count, List<HitBase> hits) { | |
List<String[]> treeBankBuffer = new ArrayList<String[]> (); | |
try { | |
// get the parses from original documents, and form the training dataset | |
ParseThicket pt = matcher.buildParseThicketFromTextWithRST(searchResultText); | |
List<Tree> forest = pt.getSentences(); | |
// if from the first half or ranked docs, then positive, otherwise negative | |
String posOrNeg = null; | |
if (count<hits.size()/2) | |
posOrNeg=" 1 "; | |
else | |
posOrNeg=" -1 "; | |
// form the list of training samples | |
for(Tree t: forest){ | |
treeBankBuffer.add(new String[] {posOrNeg+" |BT| "+t.toString()+ " |ET|"}); | |
} | |
} catch (Exception e) { | |
// TODO Auto-generated catch block | |
e.printStackTrace(); | |
} | |
return treeBankBuffer; | |
} | |
public static void main(String[] args){ | |
String query = " I see no meaningful distinction between complacency or complicity in the military's latest failure to uphold their own " + | |
"standards of conduct. Nor do I see a distinction between the service member who orchestrated this offense and the chain of " + | |
"command that was either oblivious to or tolerant of criminal behavior"; | |
query = "I am now living abroad and have health insurance from Russia. How can I avoid penalty for not having health insurance in US"; | |
MultiSentenceKernelBasedSearchResultsProcessor proc = new MultiSentenceKernelBasedSearchResultsProcessor(); | |
proc.setKernelPath("C:\\stanford-corenlp\\tree_kernel\\"); | |
proc.runSearchViaAPI(query); | |
} | |
} |