bing api
diff --git a/opennlp-similarity/pom.xml b/opennlp-similarity/pom.xml
index b7e7bd0..35b768b 100644
--- a/opennlp-similarity/pom.xml
+++ b/opennlp-similarity/pom.xml
@@ -44,6 +44,14 @@
<prerequisites>
<maven>3.0</maven>
</prerequisites>
+
+ <!-- NOTE(review): extra repository, presumably needed to resolve the
+      net.billylieurance.azuresearch dependency declared in this POM.
+      The URL is plain http; recent Maven releases block external http
+      repositories by default, so confirm whether an https URL exists. -->
+ <repositories>
+ <repository>
+ <id>net.billylieurance</id>
+ <name>BillyLieuranceNet</name>
+ <url>http://www.billylieurance.net/maven2</url>
+ </repository>
+ </repositories>
<dependencies>
<dependency>
@@ -58,7 +66,7 @@
<version>4.8.1</version>
<scope>test</scope>
</dependency>
- <dependency>
+ <dependency>
<groupId>commons-lang</groupId>
<artifactId>commons-lang</artifactId>
<version>2.5</version>
@@ -91,6 +99,46 @@
<artifactId>solr-core</artifactId>
<version>4.0.0-BETA</version>
</dependency>
+ <dependency>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ <version>1.7</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-logging</groupId>
+ <artifactId>commons-logging</artifactId>
+ <version>1.1.1</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.httpcomponents</groupId>
+ <artifactId>httpclient</artifactId>
+ <version>4.2.1</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.httpcomponents</groupId>
+ <artifactId>httpclient-cache</artifactId>
+ <version>4.2.1</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.httpcomponents</groupId>
+ <artifactId>httpcore</artifactId>
+ <version>4.2.1</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.httpcomponents</groupId>
+ <artifactId>httpmime</artifactId>
+ <version>4.2.1</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.httpcomponents</groupId>
+ <artifactId>fluent-hc</artifactId>
+ <version>4.2.1</version>
+ </dependency>
+ <dependency>
+ <groupId>net.billylieurance.azuresearch</groupId>
+ <artifactId>azure-bing-search-java</artifactId>
+ <version>0.11.0</version>
+ </dependency>
</dependencies>
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java
index 1b65034..a934264 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java
@@ -28,128 +28,41 @@
import java.util.List;
import java.util.logging.Logger;
+import net.billylieurance.azuresearch.AzureSearchResultSet;
+import net.billylieurance.azuresearch.AzureSearchWebQuery;
+import net.billylieurance.azuresearch.AzureSearchWebResult;
import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;
import org.apache.commons.lang.StringUtils;
import org.json.JSONArray;
import org.json.JSONObject;
+
public class BingWebQueryRunner {
private static final Logger LOG = Logger
.getLogger("opennlp.tools.similarity.apps.BingWebQueryRunner");
-
- private String constructBingWebUrl(String query, int numbOfHits)
- throws Exception {
- String codedQuery = URLEncoder.encode(query, "UTF-8");
-
- String yahooRequest = "https://api.datamarket.azure.com/Bing/SearchWeb"
- // "http://api.search.live.net/json.aspx?Appid="
- + BingQueryRunner.APP_ID + "&Query=" + codedQuery ;
- // + "&Sources=Web"
- // Common request fields (optional)
- // + "&Version=2.0" + "&Market=en-us&web.count=" + numbOfHits
- // News-specific request fields (optional)
- // + "&News.Offset=0";
-
- return yahooRequest;
- }
-
- public BingResponse populateBingHit(String response) throws Exception {
- BingResponse resp = new BingResponse();
- JSONObject rootObject = new JSONObject(response);
- // each response is object that under the key of "ysearchresponse"
- JSONObject responseObject = rootObject.getJSONObject("SearchResponse");
- JSONObject web = responseObject.getJSONObject("Web"); // "News"
-
- // the search result is in an array under the name of "results"
- JSONArray resultSet = null;
- try {
- resultSet = web.getJSONArray("Results");
- int count = (int) web.getLong("Total");
- resp.setTotalHits(new Integer(count));
- } catch (Exception e) {
- e.printStackTrace();
- LOG.severe("\nNo search results " + e);
-
- }
- if (resultSet != null) {
- for (int i = 0; i < resultSet.length(); i++) {
- try {
- HitBase hit = new HitBase();
- JSONObject singleResult = resultSet.getJSONObject(i);
- hit.setAbstractText(singleResult.getString("Description"));
- hit.setDate(singleResult.getString("DateTime"));
- String title = StringUtils.replace(singleResult.getString("Title"),
- "", " ");
- hit.setTitle(title);
- hit.setUrl(singleResult.getString("Url"));
-
- resp.appendHits(hit);
- } catch (Exception e) {
- // incomplete search result: do not through exception
- }
- }
- }
- return resp;
- }
-
- public ArrayList<String> search(String query, String domainWeb, String lang,
- int numbOfHits) throws Exception {
- URL url = new URL(constructBingWebUrl(query, numbOfHits));
- URLConnection connection = url.openConnection();
-
- String line;
- ArrayList<String> result = new ArrayList<String>();
- BufferedReader reader = new BufferedReader(new InputStreamReader(
- connection.getInputStream()));
- int count = 0;
- while ((line = reader.readLine()) != null) {
- result.add(line);
- count++;
- }
- return result;
- }
-
- public List<HitBase> runSearch(String query) {
- BingResponse resp = null;
- try {
- List<String> resultList = search(query, "", "", 8);
- resp = populateBingHit(resultList.get(0));
-
- } catch (Exception e) {
- // e.printStackTrace();
- LOG.info("No news search results for query " + query);
- return null;
- }
- // cast to super class
- List<HitBase> hits = new ArrayList<HitBase>();
- for (HitBase h : resp.getHits())
- hits.add((HitBase) h);
-
- hits = removeDuplicates(hits, 0.9);
-
- return hits;
- }
-
- public List<HitBase> runSearch(String query, int num) {
- BingResponse resp = null;
- try {
- List<String> resultList = search(query, "", "", num);
- resp = populateBingHit(resultList.get(0));
-
- } catch (Exception e) {
- // e.printStackTrace();
- LOG.info("No news search results for query " + query);
- return null;
- }
- // cast to super class
- List<HitBase> hits = new ArrayList<HitBase>();
- for (HitBase h : resp.getHits())
- hits.add((HitBase) h);
-
- hits = removeDuplicates(hits, 0.9);
- return hits;
- }
+ // SECURITY NOTE(review): this looks like a real API credential committed to
+ // source control; consider loading it from configuration and rotating the key.
+ public static final String BING_KEY = "TyfmF/4t1qbnA5X6sBXiTf80l29cSn+7IT0fPw2FNsU=";
+ // Query object shared by every runSearch call made on this instance.
+ private AzureSearchWebQuery aq = new AzureSearchWebQuery();
+
+ /**
+  * Runs a web search through the Azure search client and converts every
+  * returned result into a {@link HitBase} (abstract text, title and URL).
+  *
+  * @param query search expression, handed to the client unchanged
+  * @param nRes requested number of results. NOTE(review): this value is not
+  *          applied to the query, so the number of hits is whatever the
+  *          client returns; confirm whether a limit should be set
+  * @return the collected hits after one pass through
+  *         {@code removeDuplicates} with threshold 0.9; an empty list when
+  *         the client provides no result set or no results
+  */
+ public List<HitBase> runSearch(String query, int nRes) {
+   aq.setAppid(BING_KEY);
+   aq.setQuery(query);
+   aq.doQuery();
+
+   List<HitBase> results = new ArrayList<HitBase>();
+   AzureSearchResultSet<AzureSearchWebResult> ars = aq.getQueryResult();
+   if (ars == null) {
+     // no result set available: report "no hits" instead of failing in the loop
+     return results;
+   }
+
+   for (AzureSearchWebResult anr : ars) {
+     HitBase h = new HitBase();
+     h.setAbstractText(anr.getDescription());
+     h.setTitle(anr.getTitle());
+     h.setUrl(anr.getUrl());
+     results.add(h);
+   }
+
+   // De-duplicate once over the complete list (as the code replaced by this
+   // change did) rather than re-scanning the growing list after each insert.
+   return results.isEmpty() ? results : removeDuplicates(results, 0.9);
+ }
+
public static List<HitBase> removeDuplicates(List<HitBase> hits,
double imageDupeThresh) {
@@ -185,10 +98,10 @@
}
public int getTotalPagesAtASite(String site) {
- BingResponse resp = null;
+
try {
- List<String> resultList = search("site:" + site, "", "", 10);
- resp = populateBingHit(resultList.get(0));
+ List<HitBase> resultList = runSearch("site:" + site, 10);
+
} catch (Exception e) {
// e.printStackTrace();
@@ -196,11 +109,8 @@
return 0;
}
- return resp.totalHits;
+ return 0;
}
- public static void main(String[] args) {
- int res = new BingWebQueryRunner().getTotalPagesAtASite("www.zvents.com");
- new BingWebQueryRunner().runSearch("site:www.tripadvisor.com", 10);
- };
+
}
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java
index 47e0d04..f47bde6 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java
@@ -59,27 +59,35 @@
|| sentTry.indexOf("clicking here") > -1
|| sentTry.indexOf("skip to") > -1 || sentTry.indexOf("sidebar") > -1
|| sentTry.indexOf("Tags:") > -1 || sentTry.startsWith("Posted by")
- || sentTry.indexOf("available online") > 0
- || sentTry.indexOf("get online") > 0
- || sentTry.indexOf("buy online") > 0
- || sentTry.indexOf("not valid") > 0 || sentTry.indexOf("discount") > 0
- || sentTry.indexOf("official site") > 0
- || sentTry.indexOf("this video") > 0
- || sentTry.indexOf("this book") > 0
- || sentTry.indexOf("this product") > 0
- || sentTry.indexOf("paperback") > 0 || sentTry.indexOf("hardcover") > 0
- || sentTry.indexOf("audio cd") > 0
- || sentTry.indexOf("related searches") > 0
- || sentTry.indexOf("permission is granted") > 0
- || sentTry.indexOf("[edit") > 0
- || sentTry.indexOf("edit categories") > 0
- || sentTry.indexOf("free license") > 0
- || sentTry.indexOf("permission is granted") > 0
- || sentTry.indexOf("under the terms") > 0
- || sentTry.indexOf("rights reserved") > 0
- || sentTry.indexOf("wikipedia") > 0 || sentTry.endsWith("the")
- || sentTry.endsWith("the.") || sentTry.startsWith("below")
-
+ || sentTry.indexOf("available online") > -1
+ || sentTry.indexOf("get online") > -1
+ || sentTry.indexOf("buy online") > -1
+ || sentTry.indexOf("not valid") > -1 || sentTry.indexOf("discount") > -1
+ || sentTry.indexOf("official site") > -1
+ || sentTry.indexOf("this video") > -1
+ || sentTry.indexOf("this book") > -1
+ || sentTry.indexOf("this product") > -1
+ || sentTry.indexOf("paperback") > -1 || sentTry.indexOf("hardcover") > -1
+ || sentTry.indexOf("audio cd") > -1
+ || sentTry.indexOf("related searches") > -1
+ || sentTry.indexOf("permission is granted") > -1
+ || sentTry.indexOf("[edit") > -1
+ || sentTry.indexOf("edit categories") > -1
+ || sentTry.indexOf("free license") > -1
+ || sentTry.indexOf("permission is granted") > -1
+ || sentTry.indexOf("under the terms") > -1
+ || sentTry.indexOf("rights reserved") > -1
+ || sentTry.indexOf("wikipedia") > -1 || sentTry.endsWith("the")
+ || sentTry.endsWith("the.") || sentTry.startsWith("below")
+ || sentTry.indexOf("recipient of")>-1 || sentTry.indexOf("this message")>-1
+ ||sentTry.indexOf( "mailing list")>-1 ||sentTry.indexOf( "purchase order")>-1
+ ||sentTry.indexOf( "mon-fri")>-1 ||sentTry.indexOf( "email us")>-1 ||sentTry.indexOf( "privacy pol")>-1 ||sentTry.indexOf( "back to top")>-1
+ ||sentTry.indexOf( "click here")>-1 ||sentTry.indexOf( "for details")>-1 ||sentTry.indexOf( "assistance?")>-1 ||sentTry.indexOf( "chat live")>-1
+ ||sentTry.indexOf( "free shipping")>-1 ||sentTry.indexOf( "company info")>-1 ||sentTry.indexOf( "satisfaction g")>-1 ||sentTry.indexOf( "contact us")>-1
+
+ ||sentTry.startsWith( "fax") ||sentTry.startsWith( "write") || sentTry.startsWith( "email")||sentTry.indexOf( "conditions")>-1 ||sentTry.indexOf( "chat live")>-1
+ ||sentTry.indexOf( "free shipping")>-1 ||sentTry.indexOf( "company info")>-1 ||sentTry.indexOf( "satisfaction g")>-1 ||sentTry.indexOf( "contact us")>-1
+
)
return null;
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserCacheSerializer.java b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserCacheSerializer.java
index f6da4de..40096c3 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserCacheSerializer.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserCacheSerializer.java
@@ -120,8 +120,8 @@
+ parseCacheFileNameCSV), ',');
lines = reader.readAll();
} catch (FileNotFoundException e) {
- //e.printStackTrace();
- System.err.println("Cannot find cache file");
+ if (javaObjectSerialization)
+ System.err.println("Cannot find cache file");
return null;
} catch (IOException ioe) {
ioe.printStackTrace();
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
index bd03628..a4aa734 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
@@ -58,8 +58,7 @@
private static final String MODEL_DIR_KEY = "nlp.models.dir";
// TODO config
// this is where resources should live
- private static String MODEL_DIR;
- public static String MODEL_DIR_REL = "src/test/resources/models";
+ // Model directory; stays null until supplied via getInstance(String) or
+ // defaulted from MODEL_DIR_REL during initialization.
+ private static String MODEL_DIR = null;
+ // Kept public, as it was before this change, so existing references to
+ // ParserChunker2MatcherProcessor.MODEL_DIR_REL keep compiling.
+ public static String MODEL_DIR_REL = "src/test/resources/models";
protected static ParserChunker2MatcherProcessor instance;
private SentenceDetector sentenceDetector;
@@ -110,8 +109,10 @@
sentence_parseObject = new HashMap<String, String[][]>();
try {
- MODEL_DIR = new File(".").getAbsolutePath().replace(".", "")
- + MODEL_DIR_REL;
+ // Default the model directory only when no explicit path was supplied
+ // through getInstance(String).
+ if (MODEL_DIR == null) {
+   // Resolve the relative location against the working directory. The
+   // earlier getAbsolutePath().replace(".", "") stripped every '.' from
+   // the path, which corrupts working directories whose names contain a dot.
+   MODEL_DIR = new File(MODEL_DIR_REL).getAbsolutePath();
+ }
+
initializeSentenceDetector();
initializeTokenizer();
initializePosTagger();
@@ -141,6 +142,14 @@
return instance;
}
+
+ /**
+  * Returns the shared processor, first pointing the model directory at
+  * {@code fullPathToResources + "/models"}.
+  *
+  * NOTE(review): the model directory is reassigned on every call, but a new
+  * instance is only constructed when none exists yet, so a path passed after
+  * the first construction does not rebuild the instance. Confirm this is
+  * intended.
+  *
+  * @param fullPathToResources directory expected to contain a "models" folder
+  * @return the singleton instance
+  */
+ public synchronized static ParserChunker2MatcherProcessor getInstance(String fullPathToResources) {
+   MODEL_DIR = fullPathToResources + "/models";
+   if (instance != null) {
+     return instance;
+   }
+   instance = new ParserChunker2MatcherProcessor();
+   return instance;
+ }
/**
* General parsing function, which returns lists of parses for a portion of