blob: 477f022033ad042989c570343f1d4d701ede0cd0 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.similarity.apps.solr;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Logger;
import opennlp.tools.parse_thicket.apps.SnippetToParagraph;
import opennlp.tools.parse_thicket.matching.Matcher;
import opennlp.tools.similarity.apps.BingQueryRunner;
import opennlp.tools.similarity.apps.HitBase;
import opennlp.tools.similarity.apps.HitBaseComparable;
import opennlp.tools.similarity.apps.utils.Pair;
import opennlp.tools.textsimilarity.ParseTreeChunk;
import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;
import opennlp.tools.textsimilarity.SentencePairMatchResult;
import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.CachingWrapperFilter;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryWrapperFilter;
import org.apache.lucene.search.ScoreDoc;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.ShardParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.handler.component.SearchHandler;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
public class SearchResultsReRankerStanfRequestHandler extends SearchHandler {
private static Logger LOG = Logger
.getLogger("com.become.search.requestHandlers.SearchResultsReRankerRequestHandler");
private final static int MAX_SEARCH_RESULTS = 100;
private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
private int MAX_QUERY_LENGTH_NOT_TO_RERANK=3;
private Matcher matcher = new Matcher();
private BingQueryRunner bingSearcher = new BingQueryRunner();
private SnippetToParagraph snp = new SnippetToParagraph();
public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp){
// get query string
String requestExpression = req.getParamString();
String[] exprParts = requestExpression.split("&");
for(String part: exprParts){
if (part.startsWith("q="))
requestExpression = part;
}
String query = StringUtils.substringAfter(requestExpression, ":");
LOG.info(requestExpression);
SolrParams ps = req.getOriginalParams();
Iterator<String> iter = ps.getParameterNamesIterator();
List<String> keys = new ArrayList<String>();
while(iter.hasNext()){
keys.add(iter.next());
}
List<HitBase> searchResults = new ArrayList<HitBase>();
for ( Integer i=0; i< MAX_SEARCH_RESULTS; i++){
String title = req.getParams().get("t"+i.toString());
String descr = req.getParams().get("d"+i.toString());
if(title==null || descr==null)
continue;
HitBase hit = new HitBase();
hit.setTitle(title);
hit.setAbstractText(descr);
hit.setSource(i.toString());
searchResults.add(hit);
}
/*
* http://173.255.254.250:8983/solr/collection1/reranker/?
* q=search_keywords:design+iphone+cases&fields=spend+a+day+with+a+custom+iPhone+case&fields=Add+style+to+your+every+day+fresh+design+with+a+custom+iPhone+case&fields=Add+style+to+your+every+day+with+mobile+case+for+your+family&fields=Add+style+to+your+iPhone+and+iPad&fields=Add+Apple+fashion+to+your+iPhone+and+iPad
*
*/
if (searchResults.size()<1) {
int count=0;
for(String val : exprParts){
if (val.startsWith("fields=")){
val = StringUtils.mid(val, 7, val.length());
HitBase hit = new HitBase();
hit.setTitle("");
hit.setAbstractText(val);
hit.setSource(new Integer(count).toString());
searchResults.add(hit);
count++;
}
}
}
List<HitBase> reRankedResults = null;
query = query.replace('+', ' ');
if (tooFewKeywords(query)|| orQuery(query)){
reRankedResults = searchResults;
LOG.info("No re-ranking for "+query);
}
else
reRankedResults = calculateMatchScoreResortHits(searchResults, query);
/*
* <scores>
<score index="2">3.0005</score>
<score index="1">2.101</score>
<score index="3">2.1003333333333334</score>
<score index="4">2.00025</score>
<score index="5">1.1002</score>
</scores>
*
*
*/
StringBuffer buf = new StringBuffer();
buf.append("<scores>");
for(HitBase hit: reRankedResults){
buf.append("<score index=\""+hit.getSource()+"\">"+hit.getGenerWithQueryScore()+"</score>");
}
buf.append("</scores>");
NamedList<Object> scoreNum = new NamedList<Object>();
for(HitBase hit: reRankedResults){
scoreNum.add(hit.getSource(), hit.getGenerWithQueryScore());
}
StringBuffer bufNums = new StringBuffer();
bufNums.append("order>");
for(HitBase hit: reRankedResults){
bufNums.append(hit.getSource()+"_");
}
bufNums.append("/order>");
LOG.info("re-ranking results: "+buf.toString());
NamedList<Object> values = rsp.getValues();
values.remove("response");
values.add("response", scoreNum);
values.add("new_order", bufNums.toString().trim());
rsp.setAllValues(values);
}
private boolean orQuery(String query) {
if (query.indexOf('|')>-1)
return true;
return false;
}
private boolean tooFewKeywords(String query) {
String[] parts = query.split(" ");
if (parts!=null && parts.length< MAX_QUERY_LENGTH_NOT_TO_RERANK)
return true;
return false;
}
protected List<HitBase> calculateMatchScoreResortHits(List<HitBase> hits,
String searchQuery) {
List<HitBase> newHitList = new ArrayList<HitBase>();
int count = 0;
for (HitBase hit : hits) {
if (count>10)
break;
count++;
String[] pageSentsAndSnippet = formTextForReRankingFromHit(hit);
Double score = 0.0;
try {
List<List<ParseTreeChunk>> match = null;
if (pageSentsAndSnippet!=null && pageSentsAndSnippet[0].length()>50){
match = matcher.assessRelevanceCache(pageSentsAndSnippet[0] ,
searchQuery);
score = parseTreeChunkListScorer.getParseTreeChunkListScore(match);
hit.setSource(match.toString());
}
if (score < 2){ // attempt to match with snippet, if not much luck with original text
match = matcher.assessRelevanceCache(pageSentsAndSnippet[0] ,
searchQuery);
score = parseTreeChunkListScorer.getParseTreeChunkListScore(match);
}
LOG.info(score + " | " +pageSentsAndSnippet[1]);
} catch (Exception e) {
LOG.severe("Problem processing snapshot " + pageSentsAndSnippet[1]);
e.printStackTrace();
}
hit.setGenerWithQueryScore(score);
newHitList.add(hit);
}
System.out.println("\n\n ============= old ORDER ================= ");
for (HitBase hit : newHitList) {
System.out.println(hit.getOriginalSentences().toString() + " => "+hit.getGenerWithQueryScore());
System.out.println("match = "+hit.getSource());
}
Collections.sort(newHitList, new HitBaseComparable());
System.out.println("\n\n ============= NEW ORDER ================= ");
for (HitBase hit : newHitList) {
System.out.println(hit.getOriginalSentences().toString() + " => "+hit.getGenerWithQueryScore());
System.out.println("match = "+hit.getSource());
}
return newHitList;
}
protected String[] formTextForReRankingFromHit(HitBase hit) {
HitBase hitWithFullSents = snp.formTextFromOriginalPageGivenSnippet(hit);
String textFromOriginalPage = "";
try {
List<String> sents = hitWithFullSents.getOriginalSentences();
for(String s: sents){
textFromOriginalPage+=s+" ";
}
if (textFromOriginalPage.startsWith(".")){
textFromOriginalPage = textFromOriginalPage.substring(2);
}
textFromOriginalPage = textFromOriginalPage.replace(" . .", ". ").replace(". . ", ". ").
replace("..", ". ").trim();
} catch (Exception e1) {
e1.printStackTrace();
LOG.info("Problem processing snapshot "+hit.getAbstractText());
}
hit.setPageContent(textFromOriginalPage);
String snapshot = hit.getAbstractText().replace("<b>...</b>", ". ").replace("<span class='best-phrase'>", " ").replace("<span>", " ").replace("<span>", " ")
.replace("<b>", "").replace("</b>", "");
snapshot = snapshot.replace("</B>", "").replace("<B>", "")
.replace("<br>", "").replace("</br>", "").replace("...", ". ")
.replace("|", " ").replace(">", " ").replace(". .", ". ");
snapshot += " . " + hit.getTitle();
return new String[] { textFromOriginalPage, snapshot };
}
public class HitBaseComparable implements Comparator<HitBase> {
// @Override
public int compare(HitBase o1, HitBase o2) {
return (o1.getGenerWithQueryScore() > o2.getGenerWithQueryScore() ? -1
: (o1 == o2 ? 0 : 1));
}
}
}
/*
http://dev1.exava.us:8086/solr/collection1/reranker/?q=search_keywords:I+want+style+in+my+every+day+fresh+design+iphone+cases
&t1=Personalized+iPhone+4+Cases&d1=spend+a+day+with+a+custom+iPhone+case
&t2=iPhone+Cases+to+spend+a+day&d2=Add+style+to+your+every+day+fresh+design+with+a+custom+iPhone+case
&t3=Plastic+iPhone+Cases&d3=Add+style+to+your+every+day+with+mobile+case+for+your+family
&t4=Personalized+iPhone+and+iPad+Cases&d4=Add+style+to+your+iPhone+and+iPad
&t5=iPhone+accessories+from+Apple&d5=Add+Apple+fashion+to+your+iPhone+and+iPad
http://dev1.exava.us:8086/solr/collection1/reranker/?q=search_keywords:I+want+style+in+my+every+day+fresh+design+iphone+cases&t1=Personalized+iPhone+4+Cases&d1=spend+a+day+with+a+custom+iPhone+case&t2=iPhone+Cases+to+spend+a+day&d2=Add+style+to+your+every+day+fresh+design+with+a+custom+iPhone+case&t3=Plastic+iPhone+Cases&d3=Add+style+to+your+every+day+with+mobile+case+for+your+family&t4=Personalized+iPhone+and+iPad+Cases&d4=Add+style+to+your+iPhone+and+iPad&t5=iPhone+accessories+from+Apple&d5=Add+Apple+fashion+to+your+iPhone+and+iPad
*/