blob: c4e2a3e29f62fad7124108c42ef0ab00cad0a30e [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.similarity.apps;
import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Logger;
import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;
import org.apache.commons.lang.StringUtils;
import org.json.JSONArray;
import org.json.JSONObject;
/**
 * Issues web-search requests to the Bing API 2.0 JSON endpoint, converts the
 * JSON answer into a {@link BingResponse} holding {@link HitBase} entries and
 * drops hits whose titles are near-duplicates of an earlier hit.
 */
public class BingWebQueryRunner {
  private static final Logger LOG = Logger
      .getLogger("opennlp.tools.similarity.apps.BingWebQueryRunner");

  /** Number of hits requested by {@link #runSearch(String)}. */
  private static final int DEFAULT_NUM_HITS = 8;

  /** Title-similarity score above which a later hit is treated as a duplicate. */
  private static final double TITLE_DUPE_THRESHOLD = 0.9;

  /** Charset used both to URL-encode the query and to decode the response. */
  private static final String CHARSET_NAME = "UTF-8";

  /**
   * Builds the request URL for a web search.
   *
   * @param query
   *          raw (not yet URL-encoded) query string
   * @param numbOfHits
   *          value sent as the {@code web.count} request field
   * @return the complete request URL
   * @throws IOException
   *           if the UTF-8 encoding is not supported by the runtime
   */
  private String constructBingWebUrl(String query, int numbOfHits)
      throws IOException {
    String codedQuery = URLEncoder.encode(query, CHARSET_NAME);
    String bingRequest = "http://api.search.live.net/json.aspx?Appid="
        + BingQueryRunner.APP_ID + "&query=" + codedQuery
        + "&Sources=Web"
        // Common request fields (optional)
        + "&Version=2.0" + "&Market=en-us&web.count=" + numbOfHits
        // News-specific request fields (optional)
        + "&News.Offset=0";
    return bingRequest;
  }

  /**
   * Parses a JSON response body into a {@link BingResponse}.
   *
   * <p>
   * A missing {@code Results} array or {@code Total} field is logged and
   * tolerated; a single result lacking one of the expected fields is skipped.
   *
   * @param response
   *          JSON text containing a {@code SearchResponse} object with a
   *          {@code Web} member
   * @return the populated response object (never {@code null})
   * @throws Exception
   *           if the text is not valid JSON or lacks the
   *           {@code SearchResponse}/{@code Web} objects
   */
  public BingResponse populateBingHit(String response) throws Exception {
    BingResponse resp = new BingResponse();
    JSONObject rootObject = new JSONObject(response);
    // the payload sits under the "SearchResponse" key
    JSONObject responseObject = rootObject.getJSONObject("SearchResponse");
    JSONObject web = responseObject.getJSONObject("Web"); // "News"
    // the hits are in an array under the name of "Results"
    JSONArray resultSet = null;
    try {
      resultSet = web.getJSONArray("Results");
      int count = (int) web.getLong("Total");
      resp.setTotalHits(Integer.valueOf(count));
    } catch (Exception e) {
      LOG.severe("\nNo search results " + e);
    }
    if (resultSet != null) {
      for (int i = 0; i < resultSet.length(); i++) {
        try {
          HitBase hit = new HitBase();
          JSONObject singleResult = resultSet.getJSONObject(i);
          hit.setAbstractText(singleResult.getString("Description"));
          hit.setDate(singleResult.getString("DateTime"));
          // NOTE(review): the search string below appears to be empty, in
          // which case commons-lang returns the title unchanged. Presumably a
          // non-printable marker character was lost from this literal -
          // confirm against the original sources.
          String title = StringUtils.replace(singleResult.getString("Title"),
              "", " ");
          hit.setTitle(title);
          hit.setUrl(singleResult.getString("Url"));
          resp.appendHits(hit);
        } catch (Exception e) {
          // incomplete search result: skip it, do not throw an exception
          LOG.fine("Skipping incomplete search result " + i + ": " + e);
        }
      }
    }
    return resp;
  }

  /**
   * Sends the query to the search endpoint and returns the raw response.
   *
   * @param query
   *          raw query string
   * @param domainWeb
   *          not used by this implementation
   * @param lang
   *          not used by this implementation
   * @param numbOfHits
   *          number of hits to request
   * @return the response body, one list element per line read
   * @throws Exception
   *           if the URL cannot be built or the response cannot be read
   */
  public ArrayList<String> search(String query, String domainWeb, String lang,
      int numbOfHits) throws Exception {
    URL url = new URL(constructBingWebUrl(query, numbOfHits));
    URLConnection connection = url.openConnection();
    ArrayList<String> result = new ArrayList<String>();
    BufferedReader reader = new BufferedReader(new InputStreamReader(
        connection.getInputStream(), CHARSET_NAME));
    try {
      String line;
      while ((line = reader.readLine()) != null) {
        result.add(line);
      }
    } finally {
      // always release the underlying connection stream
      reader.close();
    }
    return result;
  }

  /**
   * Runs a web search requesting the default number of hits.
   *
   * @param query
   *          raw query string
   * @return de-duplicated hits, or {@code null} if the search or the parsing
   *         of its response failed
   */
  public List<HitBase> runSearch(String query) {
    return runSearch(query, DEFAULT_NUM_HITS);
  }

  /**
   * Runs a web search and removes hits with near-duplicate titles.
   *
   * @param query
   *          raw query string
   * @param num
   *          number of hits to request
   * @return de-duplicated hits, or {@code null} if the search or the parsing
   *         of its response failed
   */
  public List<HitBase> runSearch(String query, int num) {
    BingResponse resp = null;
    try {
      List<String> resultList = search(query, "", "", num);
      // only the first line of the response is parsed
      resp = populateBingHit(resultList.get(0));
    } catch (Exception e) {
      LOG.info("No news search results for query " + query);
      return null;
    }
    // copy into a list typed by the super class
    List<HitBase> hits = new ArrayList<HitBase>();
    for (HitBase h : resp.getHits()) {
      hits.add(h);
    }
    return removeDuplicates(hits, TITLE_DUPE_THRESHOLD);
  }

  /**
   * Returns the hits whose title is not too similar to the title of any
   * earlier hit in the list. Hits with an empty title are never compared and
   * therefore always kept.
   *
   * @param hits
   *          hits to filter; the list itself is not modified
   * @param imageDupeThresh
   *          a pair of titles scoring strictly above this value is treated as
   *          duplicates and the later hit of the pair is dropped
   * @return a new list with the surviving hits in their original order; if
   *         anything goes wrong the problem is logged and the hits collected
   *         so far (possibly none) are returned
   */
  public static List<HitBase> removeDuplicates(List<HitBase> hits,
      double imageDupeThresh) {
    StringDistanceMeasurer meas = new StringDistanceMeasurer();
    List<HitBase> hitsDedup = new ArrayList<HitBase>();
    try {
      int size = hits.size();
      // flag per position instead of a growing index list with repeated
      // entries and linear look-ups
      boolean[] duplicate = new boolean[size];
      int firstDuplicate = -1;
      for (int i = 0; i < size; i++) {
        for (int j = i + 1; j < size; j++) {
          String title1 = hits.get(i).getTitle();
          String title2 = hits.get(j).getTitle();
          if (StringUtils.isEmpty(title1) || StringUtils.isEmpty(title2)) {
            continue;
          }
          if (meas.measureStringDistance(title1, title2) > imageDupeThresh) {
            // dupes found, later list member to be deleted
            duplicate[j] = true;
            if (firstDuplicate < 0) {
              firstDuplicate = j;
            }
          }
        }
      }
      for (int i = 0; i < size; i++) {
        if (!duplicate[i]) {
          hitsDedup.add(hits.get(i));
        }
      }
      if (hitsDedup.size() < size) {
        LOG.info("Removed duplicates from relevant search results, including "
            + hits.get(firstDuplicate).getTitle());
      }
    } catch (Exception e) {
      LOG.severe("Problem removing duplicates from relevant images");
    }
    return hitsDedup;
  }

  /**
   * Queries {@code site:<site>} and reports the total hit count of the
   * response.
   *
   * @param site
   *          host name to look up, e.g. {@code www.example.com}
   * @return the total reported for the site, or 0 if the lookup failed
   */
  public int getTotalPagesAtASite(String site) {
    try {
      List<String> resultList = search("site:" + site, "", "", 10);
      BingResponse resp = populateBingHit(resultList.get(0));
      // Read inside the try: populateBingHit may leave the total unset, and
      // if the field is a boxed type (its declaration is not visible here)
      // unboxing it would otherwise escape as an exception instead of 0.
      return resp.totalHits;
    } catch (Exception e) {
      LOG.info("No news search results for query = 'site:" + site);
      return 0;
    }
  }

  /**
   * Manual smoke test that issues two sample queries.
   *
   * @param args
   *          ignored
   */
  public static void main(String[] args) {
    BingWebQueryRunner runner = new BingWebQueryRunner();
    LOG.info("Total pages: " + runner.getTotalPagesAtASite("www.zvents.com"));
    runner.runSearch("site:www.tripadvisor.com", 10);
  }
}