OPENNLP-537: provide access to generic search engines to demonstrate search result re-ranking
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java
index e84b4fc..9e793b3 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java
@@ -31,7 +31,8 @@
import org.json.JSONObject;
public class BingQueryRunner {
- protected static final String APP_ID = "DD4E2A5DF8B7E5801ED443E47DC600D5F3E62713";
+ protected static final String APP_ID = "e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=";
+ //"DD4E2A5DF8B7E5801ED443E47DC600D5F3E62713";
// TODO user needs to have own APP_ID from Bing API
private float snapshotSimilarityThreshold = 0.4f;
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingSearchResultsScraper.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingSearchResultsScraper.java
deleted file mode 100644
index 1a6c3f2..0000000
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingSearchResultsScraper.java
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.tools.similarity.apps;
-
-import java.io.BufferedReader;
-import java.io.InputStreamReader;
-import java.net.URL;
-import java.net.URLConnection;
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
-import org.apache.commons.lang.StringUtils;
-
-public class BingSearchResultsScraper {
-
- protected static String fetchPageBing(String url) {
- System.out.println("fetch url " + url);
- String pageContent = null;
- StringBuffer buf = new StringBuffer();
- try {
- URLConnection connection = new URL(url).openConnection();
- connection.setReadTimeout(50000);
- connection
- .setRequestProperty(
- "User-Agent",
- "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3");
- String line;
- BufferedReader reader = null;
- try {
- reader = new BufferedReader(new InputStreamReader(
- connection.getInputStream()));
- } catch (Exception e) {
- e.printStackTrace();
- }
-
- while ((line = reader.readLine()) != null) {
- buf.append(line);
- }
-
- } catch (Exception e) {
- // e.printStackTrace();
- System.err.println("error fetching url " + url);
- }
-
- return buf.toString();
- }
-
- private static List<String> extractURLesFromPage(String content, String domain) {
- List<String> results = new ArrayList<String>();
- if (content == null)
- return results;
- content = StringUtils.substringBetween(content, ">Advanced</a></div>",
- "<input type=\"text\" value=");
- if (content == null)
- return results;
- String[] urls = content.split("<cite>");
- if (urls == null)
- return results;
- for (String u : urls) {
- int endPos = u.indexOf("</cite>");
-
- if (endPos > 0) {
- u = u.substring(0, endPos).replace("</strong>", "")
- .replace("<strong>", "");
- if (!u.equals(domain))
- results.add(u);
- }
- }
-
- return results;
- }
-
- private static String formRequestURL(String seedURL) {
- String requestUrl = "http://www.bing.com/search?q=site:" + seedURL;
-
- return requestUrl;
- }
-
- public List<String> getURLsForWebDomain(String domain) {
- return extractURLesFromPage(fetchPageBing(formRequestURL(domain)), domain);
- }
-
- public Set<String> getURLsForWebDomainIterations(String domain) {
- List<String> results = new ArrayList<String>();
- List<String> res = extractURLesFromPage(
- fetchPageBing(formRequestURL(domain)), domain);
- for (String r : res)
- results.addAll(extractURLesFromPage(fetchPageBing(formRequestURL(r)), r));
-
- return new HashSet<String>(results);
- }
-
- public static void main(String[] args) {
- System.out.println(new BingSearchResultsScraper()
- .getURLsForWebDomainIterations("www.sfgate.com/entertainment/"));
- }
-
-}
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java
index c4e2a3e..1b65034 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java
@@ -42,13 +42,14 @@
throws Exception {
String codedQuery = URLEncoder.encode(query, "UTF-8");
- String yahooRequest = "http://api.search.live.net/json.aspx?Appid="
- + BingQueryRunner.APP_ID + "&query=" + codedQuery
- + "&Sources=Web"
+ String yahooRequest = "https://api.datamarket.azure.com/Bing/SearchWeb"
+ // "http://api.search.live.net/json.aspx?Appid="
+ + BingQueryRunner.APP_ID + "&Query=" + codedQuery ;
+ // + "&Sources=Web"
// Common request fields (optional)
- + "&Version=2.0" + "&Market=en-us&web.count=" + numbOfHits
+ // + "&Version=2.0" + "&Market=en-us&web.count=" + numbOfHits
// News-specific request fields (optional)
- + "&News.Offset=0";
+ // + "&News.Offset=0";
return yahooRequest;
}
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java
index 9886807..f134e82 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java
@@ -33,25 +33,20 @@
ParserChunker2MatcherProcessor sm;
/*
- * Takes Bing API search results and calculates the parse tree similarity
+ * Takes search results from a search engine API (or scraped from a results page) and calculates the parse tree similarity
* between the question and each snippet. Ranks those snippets with higher
* similarity score up
*/
- private BingResponse calculateMatchScoreResortHits(BingResponse resp,
+
+
+ private List<HitBase> calculateMatchScoreResortHits(List<HitBase> hits,
String searchQuery) {
- // TODO
- /*
- * if query is multi-sentence, special handling int indexDot =
- * searchQuery.indexOf("."); if (indexDot>0 &&
- * indexDot<searchQuery.length()-1){ MultipleSentenceQueryAnswerer ans = new
- * MultipleSentenceQueryAnswerer(); return
- * ans.calculateMatchScoreResortHits(resp, searchQuery); }
- */
+
List<HitBase> newHitList = new ArrayList<HitBase>();
sm = ParserChunker2MatcherProcessor.getInstance();
- for (HitBase hit : resp.getHits()) {
- String snapshot = hit.getAbstractText().replace("<b>...</b>", ". ")
+ for (HitBase hit : hits) {
+ String snapshot = hit.getAbstractText().replace("<b>...</b>", ". ").replace("<span class='best-phrase'>", " ").replace("<span>", " ").replace("<span>", " ")
.replace("<b>", "").replace("</b>", "");
snapshot = snapshot.replace("</B>", "").replace("<B>", "")
.replace("<br>", "").replace("</br>", "").replace("...", ". ")
@@ -72,13 +67,13 @@
newHitList.add(hit);
}
Collections.sort(newHitList, new HitBaseComparable());
- resp.setHits(newHitList);
+
LOG.info("\n\n ============= NEW ORDER ================= ");
for (HitBase hit : newHitList) {
LOG.info(hit.toString());
}
- return resp;
+ return newHitList;
}
public void close() {
@@ -86,13 +81,21 @@
}
public List<HitBase> runSearch(String query) {
+
+ WebSearchEngineResultsScraper scraper = new WebSearchEngineResultsScraper();
+ List<HitBase> hits = scraper.runSearch(query);
+ hits = calculateMatchScoreResortHits(hits, query);
+ return hits;
+ }
+
+ public List<HitBase> runSearchViaAPI(String query) {
BingResponse resp = null, // obtained from bing
newResp = null; // re-sorted based on similarity
try {
List<String> resultList = search(query, "", "", 30);
resp = populateBingHit(resultList.get(0));
// now we apply our own relevance filter
- newResp = calculateMatchScoreResortHits(resp, query);
+ newResp.setHits(calculateMatchScoreResortHits(resp.getHits(), query));
} catch (Exception e) {
// e.printStackTrace();
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java
index 48eb9fe..1959d64 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java
@@ -27,11 +27,12 @@
import opennlp.tools.textsimilarity.SentencePairMatchResult;
import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;
-public class SpeechRecognitionResultsProcessor extends BingWebQueryRunner {
+public class SpeechRecognitionResultsProcessor /*extends BingWebQueryRunner*/ {
private static Logger LOG = Logger
.getLogger("opennlp.tools.similarity.apps.SpeechRecognitionResultsProcessor");
private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
ParserChunker2MatcherProcessor sm;
+ WebSearchEngineResultsScraper scraper = new WebSearchEngineResultsScraper();
/**
* Gets an expression and tries to find it on the web. If search results are
@@ -40,19 +41,19 @@
* results ate not similar to this phrase, we conclude that the phrase is
* meaningless (does not make sense, nobody has ever said something like that)
*
- * @param resp
- * BingResponse, search results for a phrase being assesses with
+ * @param hits
+ * list of search results for a phrase being assessed with
* respect to meaningfulness
* @param searchQuery
* the phrase we are assessing
* @return total similarity score for all search results
*/
- private double calculateTotalMatchScoreForHits(BingResponse resp,
+ private double calculateTotalMatchScoreForHits(List<HitBase> hits,
String searchQuery) {
sm = ParserChunker2MatcherProcessor.getInstance();
double totalMatchScore = 0;
- for (HitBase hit : resp.getHits()) {
+ for (HitBase hit : hits) {
String snapshot = hit.getAbstractText().replace("<b>...</b>", ". ")
.replace("<b>", "").replace("</b>", "");
snapshot = snapshot.replace("</B>", "").replace("<B>", "")
@@ -103,9 +104,8 @@
BingResponse resp = null, // obtained from bing
newResp = null; // re-sorted based on similarity
try {
- List<String> resultList = search(sentence, "", "", 10);
- resp = populateBingHit(resultList.get(0));
- double scoreForSentence = calculateTotalMatchScoreForHits(resp,
+ List<HitBase> resultList = scraper.runSearch(sentence);
+ double scoreForSentence = calculateTotalMatchScoreForHits(resultList,
sentence);
System.out.println("Total meaningfulness score = " + scoreForSentence
+ " for sentence = " + sentence);
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/WebSearchEngineResultsScraper.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/WebSearchEngineResultsScraper.java
new file mode 100644
index 0000000..6e8244e
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/WebSearchEngineResultsScraper.java
@@ -0,0 +1,197 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.similarity.apps;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.net.URLConnection;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.commons.lang.StringUtils;
+
+public class WebSearchEngineResultsScraper {
+
+  /**
+   * Downloads the page at {@code url} and returns its text with the line
+   * terminators dropped. A failure is reported on stderr and yields whatever
+   * was read up to that point (possibly an empty string), never null.
+   */
+  protected static String fetchPageSearchEngine(String url) {
+    System.out.println("fetch url " + url);
+    StringBuffer buf = new StringBuffer();
+    BufferedReader reader = null;
+    try {
+      URLConnection connection = new URL(url).openConnection();
+      connection.setReadTimeout(50000);
+      connection
+          .setRequestProperty(
+              "User-Agent",
+              "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3");
+      reader = new BufferedReader(new InputStreamReader(
+          connection.getInputStream()));
+      String line;
+      while ((line = reader.readLine()) != null) {
+        buf.append(line);
+      }
+    } catch (Exception e) {
+      System.err.println("error fetching url " + url + ": " + e);
+    } finally {
+      // release the stream even when opening or reading failed part-way
+      if (reader != null) {
+        try {
+          reader.close();
+        } catch (Exception ignored) {
+          // nothing useful can be done if close itself fails
+        }
+      }
+    }
+    return buf.toString();
+  }
+
+  /**
+   * Collects the text of each {@code <cite>} element found between the two
+   * page markers, strips {@code <strong>} tags and drops entries equal to
+   * {@code domain}. Returns an empty list when the markers are absent.
+   * NOTE(review): the markers look specific to one engine's result page;
+   * confirm they match the pages formRequestURL() requests.
+   */
+  private static List<String> extractURLsFromPage(String content, String domain) {
+    List<String> results = new ArrayList<String>();
+    if (content == null)
+      return results;
+    content = StringUtils.substringBetween(content, ">Advanced</a></div>",
+        "<input type=\"text\" value=");
+    if (content == null)
+      return results;
+    for (String u : content.split("<cite>")) {
+      int endPos = u.indexOf("</cite>");
+      if (endPos > 0) {
+        u = u.substring(0, endPos).replace("</strong>", "")
+            .replace("<strong>", "");
+        if (!u.equals(domain))
+          results.add(u);
+      }
+    }
+    return results;
+  }
+
+  /**
+   * Splits the results area of a result page on {@code </p>} and turns each
+   * fragment into a hit. Parsing is best effort: a fragment that cannot be
+   * parsed is skipped and the remaining fragments are still processed.
+   */
+  private static List<HitBase> extractSearchResultFromPage(String content) {
+    List<HitBase> results = new ArrayList<HitBase>();
+    if (content == null)
+      return results;
+    content = StringUtils.substringBetween(content, "<div id=\"results",
+        "class=\"pagination");
+    if (content == null)
+      return results;
+    for (String fragment : content.split("</p>")) {
+      try {
+        HitBase hit = parseHit(fragment);
+        if (hit != null)
+          results.add(hit);
+      } catch (RuntimeException e) {
+        // page-layout specific problem: report it and keep the other hits
+        System.err.println("skipping unparsable search result: " + e);
+      }
+    }
+    return results;
+  }
+
+  /** Builds one hit from a fragment; null if it is too short or has no title. */
+  private static HitBase parseHit(String fragment) {
+    if (fragment.length() < 5)
+      return null; // too short to hold a result entry
+    String u = fragment.substring(5);
+    String title = StringUtils.substringBetween(u, "\">", "</a><br />");
+    if (title == null)
+      return null;
+    // drop the remainder of the anchor tag, but only if one is really there
+    int tagEnd = title.indexOf("\">");
+    if (tagEnd >= 0)
+      title = title.substring(tagEnd + 2);
+    String url = StringUtils.substringBetween(u, "class=\"url", "</span>");
+    if (url != null)
+      url = url.substring(2);
+    HitBase hit = new HitBase();
+    hit.setUrl(url);
+    hit.setAbstractText(StringUtils.substringBetween(u, "\"body\">", "</span><br /"));
+    hit.setTitle(title);
+    return hit;
+  }
+
+  /** Builds the search request URL, percent-encoding the query text. */
+  private static String formRequestURL(String query) {
+    try {
+      return "http://www.hakia.com/search/web?q="
+          + java.net.URLEncoder.encode(query, "UTF-8");
+    } catch (java.io.UnsupportedEncodingException e) {
+      // every JVM supports UTF-8, so this is a fallback only
+      return "http://www.hakia.com/search/web?q=" + query.replace(' ', '+');
+    }
+  }
+
+  /** Fetches the result page for {@code domain} and extracts the URLs cited on it. */
+  public List<String> getURLsForWebDomain(String domain) {
+    return extractURLsFromPage(fetchPageSearchEngine(formRequestURL(domain)), domain);
+  }
+
+  /**
+   * Extracts the URLs cited on the result page for {@code domain}, searches
+   * again for each of them, and returns the distinct URLs of those searches.
+   */
+  public Set<String> getURLsForWebDomainIterations(String domain) {
+    List<String> results = new ArrayList<String>();
+    List<String> res = extractURLsFromPage(
+        fetchPageSearchEngine(formRequestURL(domain)), domain);
+    for (String r : res)
+      results.addAll(extractURLsFromPage(fetchPageSearchEngine(formRequestURL(r)), r));
+    return new HashSet<String>(results);
+  }
+
+  /**
+   * Fetches the result page for {@code query}, parses it into hits and passes
+   * them through {@code HitBase.removeDuplicates}. An exception raised while
+   * fetching or parsing is reported on stderr and yields an empty list.
+   */
+  public List<HitBase> runSearch(String query) {
+    List<HitBase> hits = new ArrayList<HitBase>();
+    try {
+      String serp = fetchPageSearchEngine(formRequestURL(query));
+      hits = extractSearchResultFromPage(serp);
+    } catch (Exception e) {
+      System.err.println("search failed for query '" + query + "': " + e);
+      return hits;
+    }
+    return HitBase.removeDuplicates(hits);
+  }
+
+  /** Demo entry point: prints the hits for a sample query. */
+  public static void main(String[] args) {
+    WebSearchEngineResultsScraper scraper = new WebSearchEngineResultsScraper();
+    System.out.println(scraper.runSearch("lady gaga in san francisco"));
+  }
+
+}
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Pair.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Pair.java
index a209e8a..0bae33e 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Pair.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Pair.java
@@ -17,6 +17,10 @@
package opennlp.tools.similarity.apps.utils;
+import java.util.Comparator;
+
+import opennlp.tools.similarity.apps.HitBase;
+
/**
* Generic pair class for holding two objects. Often used as return object.
*
@@ -54,4 +58,19 @@
public void setSecond(T2 second) {
this.second = second;
}
+
+ public class PairComparable implements Comparator<Pair<T1, T2>> {
+   /** Orders pairs by descending Float 'second'; other pairs compare as equal. */
+   // @Override
+   public int compare(Pair o1, Pair o2) {
+     if (o1.second instanceof Float && o2.second instanceof Float) {
+       // compare by value: '==' on boxed Floats only tests object identity
+       return Float.compare((Float) o2.second, (Float) o1.second);
+     }
+     // no score to order by: 0 keeps compare(a, b) == -compare(b, a)
+     return 0;
+   }
+ }
+
}
+
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserCacheSerializer.java b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserCacheSerializer.java
index 4e8c195..f6da4de 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserCacheSerializer.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserCacheSerializer.java
@@ -120,7 +120,8 @@
+ parseCacheFileNameCSV), ',');
lines = reader.readAll();
} catch (FileNotFoundException e) {
- e.printStackTrace();
+ //e.printStackTrace();
+ System.err.println("Cannot find cache file");
return null;
} catch (IOException ioe) {
ioe.printStackTrace();
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
index 4b18cd0..8e5c5e3 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
@@ -117,8 +117,9 @@
initializePosTagger();
initializeParser();
initializeChunker();
- } catch (Exception e) {
- LOG.fine("model cant be read and we rely on cache");
+ } catch (Exception e) { // a typical error when 'model' is not installed
+ System.err.println("Please install OpenNLP model files in 'src/test/resources' (folder 'model'");
+ LOG.fine("The model can't be read and we rely on cache");
}
}