OPENNLP-537: add access to generic search engines to demonstrate search results re-ranking
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java
index e84b4fc..9e793b3 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java
@@ -31,7 +31,8 @@
 import org.json.JSONObject;

 

 public class BingQueryRunner {

-  protected static final String APP_ID = "DD4E2A5DF8B7E5801ED443E47DC600D5F3E62713";

+  protected static final String APP_ID = "e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=";

+    //"DD4E2A5DF8B7E5801ED443E47DC600D5F3E62713";

   // TODO user needs to have own APP_ID from Bing API

 

   private float snapshotSimilarityThreshold = 0.4f;

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingSearchResultsScraper.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingSearchResultsScraper.java
deleted file mode 100644
index 1a6c3f2..0000000
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingSearchResultsScraper.java
+++ /dev/null
@@ -1,115 +0,0 @@
-/*

- * Licensed to the Apache Software Foundation (ASF) under one or more

- * contributor license agreements.  See the NOTICE file distributed with

- * this work for additional information regarding copyright ownership.

- * The ASF licenses this file to You under the Apache License, Version 2.0

- * (the "License"); you may not use this file except in compliance with

- * the License. You may obtain a copy of the License at

- *

- *     http://www.apache.org/licenses/LICENSE-2.0

- *

- * Unless required by applicable law or agreed to in writing, software

- * distributed under the License is distributed on an "AS IS" BASIS,

- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

- * See the License for the specific language governing permissions and

- * limitations under the License.

- */

-

-package opennlp.tools.similarity.apps;

-

-import java.io.BufferedReader;

-import java.io.InputStreamReader;

-import java.net.URL;

-import java.net.URLConnection;

-import java.util.ArrayList;

-import java.util.HashSet;

-import java.util.List;

-import java.util.Set;

-

-import org.apache.commons.lang.StringUtils;

-

-public class BingSearchResultsScraper {

-

-  protected static String fetchPageBing(String url) {

-    System.out.println("fetch url " + url);

-    String pageContent = null;

-    StringBuffer buf = new StringBuffer();

-    try {

-      URLConnection connection = new URL(url).openConnection();

-      connection.setReadTimeout(50000);

-      connection

-          .setRequestProperty(

-              "User-Agent",

-              "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3");

-      String line;

-      BufferedReader reader = null;

-      try {

-        reader = new BufferedReader(new InputStreamReader(

-            connection.getInputStream()));

-      } catch (Exception e) {

-        e.printStackTrace();

-      }

-

-      while ((line = reader.readLine()) != null) {

-        buf.append(line);

-      }

-

-    } catch (Exception e) {

-      // e.printStackTrace();

-      System.err.println("error fetching url " + url);

-    }

-

-    return buf.toString();

-  }

-

-  private static List<String> extractURLesFromPage(String content, String domain) {

-    List<String> results = new ArrayList<String>();

-    if (content == null)

-      return results;

-    content = StringUtils.substringBetween(content, ">Advanced</a></div>",

-        "<input type=\"text\" value=");

-    if (content == null)

-      return results;

-    String[] urls = content.split("<cite>");

-    if (urls == null)

-      return results;

-    for (String u : urls) {

-      int endPos = u.indexOf("</cite>");

-

-      if (endPos > 0) {

-        u = u.substring(0, endPos).replace("</strong>", "")

-            .replace("<strong>", "");

-        if (!u.equals(domain))

-          results.add(u);

-      }

-    }

-

-    return results;

-  }

-

-  private static String formRequestURL(String seedURL) {

-    String requestUrl = "http://www.bing.com/search?q=site:" + seedURL;

-

-    return requestUrl;

-  }

-

-  public List<String> getURLsForWebDomain(String domain) {

-    return extractURLesFromPage(fetchPageBing(formRequestURL(domain)), domain);

-  }

-

-  public Set<String> getURLsForWebDomainIterations(String domain) {

-    List<String> results = new ArrayList<String>();

-    List<String> res = extractURLesFromPage(

-        fetchPageBing(formRequestURL(domain)), domain);

-    for (String r : res)

-      results.addAll(extractURLesFromPage(fetchPageBing(formRequestURL(r)), r));

-

-    return new HashSet<String>(results);

-  }

-

-  public static void main(String[] args) {

-    System.out.println(new BingSearchResultsScraper()

-        .getURLsForWebDomainIterations("www.sfgate.com/entertainment/"));

-  }

-

-}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java
index c4e2a3e..1b65034 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java
@@ -42,13 +42,14 @@
       throws Exception {

     String codedQuery = URLEncoder.encode(query, "UTF-8");

 

-    String yahooRequest = "http://api.search.live.net/json.aspx?Appid="

-        + BingQueryRunner.APP_ID + "&query=" + codedQuery 

-        + "&Sources=Web"

+    String yahooRequest = "https://api.datamarket.azure.com/Bing/SearchWeb"

+     // "http://api.search.live.net/json.aspx?Appid="

+        + BingQueryRunner.APP_ID + "&Query=" + codedQuery ;

+      //  + "&Sources=Web"

         // Common request fields (optional)

-        + "&Version=2.0" + "&Market=en-us&web.count=" + numbOfHits

+       // + "&Version=2.0" + "&Market=en-us&web.count=" + numbOfHits

          // News-specific request fields (optional)

-        + "&News.Offset=0";

+      //  + "&News.Offset=0";

 

     return yahooRequest;

   }

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java
index 9886807..f134e82 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SearchResultsProcessor.java
@@ -33,25 +33,20 @@
   ParserChunker2MatcherProcessor sm;

 

   /*

-   * Takes Bing API search results and calculates the parse tree similarity

+   * Takes search results from a search engine API (or scraped from a results page) and calculates the parse tree similarity

    * between the question and each snippet. Ranks those snippets with higher

    * similarity score up

    */

-  private BingResponse calculateMatchScoreResortHits(BingResponse resp,

+  

+  

+  private List<HitBase> calculateMatchScoreResortHits(List<HitBase> hits,

       String searchQuery) {

-    // TODO

-    /*

-     * if query is multi-sentence, special handling int indexDot =

-     * searchQuery.indexOf("."); if (indexDot>0 &&

-     * indexDot<searchQuery.length()-1){ MultipleSentenceQueryAnswerer ans = new

-     * MultipleSentenceQueryAnswerer(); return

-     * ans.calculateMatchScoreResortHits(resp, searchQuery); }

-     */

+

     List<HitBase> newHitList = new ArrayList<HitBase>();

     sm = ParserChunker2MatcherProcessor.getInstance();

 

-    for (HitBase hit : resp.getHits()) {

-      String snapshot = hit.getAbstractText().replace("<b>...</b>", ". ")

+    for (HitBase hit : hits) {

+      String snapshot = hit.getAbstractText().replace("<b>...</b>", ". ").replace("<span class='best-phrase'>", " ").replace("<span>", " ").replace("<span>", " ")

           .replace("<b>", "").replace("</b>", "");

       snapshot = snapshot.replace("</B>", "").replace("<B>", "")

           .replace("<br>", "").replace("</br>", "").replace("...", ". ")

@@ -72,13 +67,13 @@
       newHitList.add(hit);

     }

     Collections.sort(newHitList, new HitBaseComparable());

-    resp.setHits(newHitList);

+   

     LOG.info("\n\n ============= NEW ORDER ================= ");

     for (HitBase hit : newHitList) {

       LOG.info(hit.toString());

     }

 

-    return resp;

+    return newHitList;

   }

 

   public void close() {

@@ -86,13 +81,31 @@
   }

 

+  /**

+   * Scrapes a web search engine for the query and returns the hits re-ranked

+   * by calculateMatchScoreResortHits.

+   */

   public List<HitBase> runSearch(String query) {

+    

+    WebSearchEngineResultsScraper scraper = new WebSearchEngineResultsScraper();

+    List<HitBase> hits = scraper.runSearch(query);

+    hits = calculateMatchScoreResortHits(hits, query);

+    return hits;

+  }

+  

+  /**

+   * Variant of runSearch that obtains the hits through search and

+   * populateBingHit instead of scraping, then re-ranks them the same way.

+   */

+  public List<HitBase> runSearchViaAPI(String query) {

     BingResponse resp = null, // obtained from bing

     newResp = null; // re-sorted based on similarity

     try {

       List<String> resultList = search(query, "", "", 30);

       resp = populateBingHit(resultList.get(0));

       // now we apply our own relevance filter

-      newResp = calculateMatchScoreResortHits(resp, query);

+      // newResp is still null here, so re-sort the hits on resp and alias it

+      resp.setHits(calculateMatchScoreResortHits(resp.getHits(), query));

+      newResp = resp;

 

     } catch (Exception e) {

       // e.printStackTrace();

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java
index 48eb9fe..1959d64 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/SpeechRecognitionResultsProcessor.java
@@ -27,11 +27,12 @@
 import opennlp.tools.textsimilarity.SentencePairMatchResult;

 import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

 

-public class SpeechRecognitionResultsProcessor extends BingWebQueryRunner {

+public class SpeechRecognitionResultsProcessor /*extends BingWebQueryRunner*/ {

   private static Logger LOG = Logger

       .getLogger("opennlp.tools.similarity.apps.SpeechRecognitionResultsProcessor");

   private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();

   ParserChunker2MatcherProcessor sm;

+  WebSearchEngineResultsScraper scraper = new WebSearchEngineResultsScraper();

 

   /**

    * Gets an expression and tries to find it on the web. If search results are

@@ -40,19 +41,19 @@
    * results ate not similar to this phrase, we conclude that the phrase is

    * meaningless (does not make sense, nobody has ever said something like that)

    * 

-   * @param resp

-   *          BingResponse, search results for a phrase being assesses with

+   * @param hits

+   *          list of search results for a phrase being assessed with

    *          respect to meaningfulness

    * @param searchQuery

    *          the phrase we are assessing

    * @return total similarity score for all search results

    */

-  private double calculateTotalMatchScoreForHits(BingResponse resp,

+  private double calculateTotalMatchScoreForHits(List<HitBase> hits,

       String searchQuery) {

 

     sm = ParserChunker2MatcherProcessor.getInstance();

     double totalMatchScore = 0;

-    for (HitBase hit : resp.getHits()) {

+    for (HitBase hit : hits) {

       String snapshot = hit.getAbstractText().replace("<b>...</b>", ". ")

           .replace("<b>", "").replace("</b>", "");

       snapshot = snapshot.replace("</B>", "").replace("<B>", "")

@@ -103,9 +104,8 @@
       BingResponse resp = null, // obtained from bing

       newResp = null; // re-sorted based on similarity

       try {

-        List<String> resultList = search(sentence, "", "", 10);

-        resp = populateBingHit(resultList.get(0));

-        double scoreForSentence = calculateTotalMatchScoreForHits(resp,

+        List<HitBase> resultList = scraper.runSearch(sentence);

+        double scoreForSentence = calculateTotalMatchScoreForHits(resultList,

             sentence);

         System.out.println("Total meaningfulness score = " + scoreForSentence

             + " for sentence = " + sentence);

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/WebSearchEngineResultsScraper.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/WebSearchEngineResultsScraper.java
new file mode 100644
index 0000000..6e8244e
--- /dev/null
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/WebSearchEngineResultsScraper.java
@@ -0,0 +1,163 @@
+/*

+ * Licensed to the Apache Software Foundation (ASF) under one or more

+ * contributor license agreements.  See the NOTICE file distributed with

+ * this work for additional information regarding copyright ownership.

+ * The ASF licenses this file to You under the Apache License, Version 2.0

+ * (the "License"); you may not use this file except in compliance with

+ * the License. You may obtain a copy of the License at

+ *

+ *     http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+

+package opennlp.tools.similarity.apps;

+

+import java.io.BufferedReader;

+import java.io.InputStreamReader;

+import java.net.URL;

+import java.net.URLConnection;

+import java.util.ArrayList;

+import java.util.HashSet;

+import java.util.List;

+import java.util.Set;

+

+import org.apache.commons.lang.StringUtils;

+

+public class WebSearchEngineResultsScraper {

+

+  /**

+   * Downloads the page at {@code url} and returns its text with line terminators

+   * dropped. On failure an error line is written to stderr and whatever was read

+   * so far (possibly an empty string) is returned; no exception propagates.

+   *

+   * @param url address of the page to fetch

+   * @return the concatenated response lines, never {@code null}

+   */

+  protected static String fetchPageSearchEngine(String url) {

+    System.out.println("fetch url " + url);

+    StringBuffer buf = new StringBuffer();

+    BufferedReader reader = null;

+    try {

+      URLConnection connection = new URL(url).openConnection();

+      connection.setReadTimeout(50000);

+      connection.setRequestProperty("User-Agent",

+          "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3");

+      reader = new BufferedReader(new InputStreamReader(connection.getInputStream()));

+      String line;

+      while ((line = reader.readLine()) != null) {

+        buf.append(line);

+      }

+    } catch (Exception e) {

+      System.err.println("error fetching url " + url + " : " + e);

+    } finally {

+      if (reader != null) {

+        try { reader.close(); } catch (Exception ignored) { /* nothing left to recover */ }

+      }

+    }

+    return buf.toString();

+  }

+

+  /**

+   * Collects the text of every {@code <cite>} element found between the

+   * ">Advanced</a></div>" marker and the following text-input tag of a page,

+   * with {@code <strong>} markup removed and entries equal to domain skipped.

+   */

+  private static List<String> extractURLsFromPage(String content, String domain) {

+    List<String> found = new ArrayList<String>();

+    String area = (content == null) ? null : StringUtils.substringBetween(

+        content, ">Advanced</a></div>", "<input type=\"text\" value=");

+    if (area != null) {

+      for (String piece : area.split("<cite>")) {

+        int close = piece.indexOf("</cite>");

+        if (close <= 0) {

+          continue; // no closing tag, or nothing in front of it

+        }

+        String cited = piece.substring(0, close);

+        cited = cited.replace("</strong>", "").replace("<strong>", "");

+        if (!cited.equals(domain)) {

+          found.add(cited);

+        }

+      }

+    }

+    return found;

+  }

+

+  /**

+   * Parses a results page into hits. The region between {@code <div id="results}

+   * and {@code class="pagination} is split on {@code </p>}; each fragment that

+   * contains a title becomes one hit carrying url, title and abstract text.

+   * Fragments without the expected markup are skipped.

+   * @return the parsed hits; empty when content is null or has no results region

+   */

+  private static List<HitBase> extractSearchResultFromPage(String content) {

+    List<HitBase> results = new ArrayList<HitBase>();

+    content = StringUtils.substringBetween(content, "<div id=\"results",

+        "class=\"pagination");

+    if (content == null)

+      return results;

+    for (String u : content.split("</p>")) {

+      if (u.length() < 5)

+        continue; // too short to hold a result entry

+      u = u.substring(5);

+      String url = StringUtils.substringBetween(u, "class=\"url", "</span>");

+      String title = StringUtils.substringBetween(u, "\">", "</a><br />");

+      if (title == null || (url != null && url.length() < 2))

+        continue; // fragment does not have the shape of a search result

+      int tagEnd = title.indexOf("\">");

+      if (tagEnd >= 0) // strip the tag remainder only if present; -1 used to cut the first character

+        title = title.substring(tagEnd + 2);

+      HitBase hit = new HitBase();

+      hit.setUrl(url == null ? null : url.substring(2));

+      hit.setAbstractText(StringUtils.substringBetween(u, "\"body\">", "</span><br /"));

+      hit.setTitle(title);

+      results.add(hit);

+    }

+    return results;

+  }

+  

+  /** Builds the hakia web-search URL for query; the query is form-encoded (spaces still become '+'). */

+  private static String formRequestURL(String query) {

+    try { return "http://www.hakia.com/search/web?q=" + java.net.URLEncoder.encode(query, "UTF-8");

+    } catch (java.io.UnsupportedEncodingException e) { throw new IllegalStateException(e); }

+  }

+

+  public List<String> getURLsForWebDomain(String domain) { // fetch the search page for domain, then extract the URLs cited on it

+    return extractURLsFromPage(fetchPageSearchEngine(formRequestURL(domain)), domain);

+  }

+

+  /** Looks up domain, re-queries each URL found, and returns the URLs of those second-level results. */

+  public Set<String> getURLsForWebDomainIterations(String domain) {

+    String firstPage = fetchPageSearchEngine(formRequestURL(domain));

+    List<String> collected = new ArrayList<String>();

+    for (String seed : extractURLsFromPage(firstPage, domain)) {

+      collected.addAll(extractURLsFromPage(fetchPageSearchEngine(formRequestURL(seed)), seed));

+    }

+    return new HashSet<String>(collected);

+  }

+  

+  /**

+   * Fetches the results page for the query, parses it into hits and removes

+   * duplicates. If fetching or parsing throws, an empty list is returned.

+   */

+  public List<HitBase> runSearch(String query) {

+    List<HitBase> parsed;

+    try {

+      String page = fetchPageSearchEngine(formRequestURL(query));

+      parsed = extractSearchResultFromPage(page);

+    } catch (Exception e) {

+      return new ArrayList<HitBase>();

+    }

+    return HitBase.removeDuplicates(parsed);

+  }

+

+  /** Demo entry point: prints the hits found for a sample query. */

+  public static void main(String[] args) {

+    System.out.println(new WebSearchEngineResultsScraper().runSearch("lady gaga in san francisco"));

+  }

+

+}

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Pair.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Pair.java
index a209e8a..0bae33e 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Pair.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/Pair.java
@@ -17,6 +17,10 @@
 
 package opennlp.tools.similarity.apps.utils;
 
+import java.util.Comparator;
+
+import opennlp.tools.similarity.apps.HitBase;
+
 /**
  * Generic pair class for holding two objects. Often used as return object.
  * 
@@ -54,4 +58,19 @@
   public void setSecond(T2 second) {
     this.second = second;
   }
+  
+  /** Orders pairs by second element, largest Float first; pairs whose second
+   *  elements are not both Floats compare as equal. */
+  public class PairComparable implements Comparator<Pair<T1, T2>> {
+    // raw Pair parameters are kept so the method signature stays as it was
+    public int compare(Pair o1, Pair o2) {
+      if (!(o1.second instanceof Float && o2.second instanceof Float)) {
+        return 0; // was -2 in both directions, which breaks the Comparator contract
+      }
+      // compare by value: == on two boxed Floats compared object identity
+      return Float.compare((Float) o2.second, (Float) o1.second);
+    }
+  }
+  
 }
+
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserCacheSerializer.java b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserCacheSerializer.java
index 4e8c195..f6da4de 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserCacheSerializer.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserCacheSerializer.java
@@ -120,7 +120,8 @@
             + parseCacheFileNameCSV), ',');

         lines = reader.readAll();

       } catch (FileNotFoundException e) {

-        e.printStackTrace();

+        //e.printStackTrace();

+        System.err.println("Cannot find cache file");

         return null;

       } catch (IOException ioe) {

         ioe.printStackTrace();

diff --git a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
index 4b18cd0..8e5c5e3 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
@@ -117,8 +117,9 @@
       initializePosTagger();
       initializeParser();
       initializeChunker();
-    } catch (Exception e) {
-      LOG.fine("model cant be read and we rely on cache");
+    } catch (Exception e) { // a typical error when 'model' is not installed
+      System.err.println("Please install OpenNLP model files in 'src/test/resources' (folder 'model'");
+      LOG.fine("The model can't be read and we rely on cache");
     }
   }