blob: a934264a2d247cea26570d67d377d7075d345635 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.similarity.apps;
import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import net.billylieurance.azuresearch.AzureSearchResultSet;
import net.billylieurance.azuresearch.AzureSearchWebQuery;
import net.billylieurance.azuresearch.AzureSearchWebResult;
import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;
import org.apache.commons.lang.StringUtils;
import org.json.JSONArray;
import org.json.JSONObject;
/**
 * Runs web searches through the Bing search client and converts the results
 * into {@link HitBase} objects.
 *
 * <p>A single {@link AzureSearchWebQuery} instance is held as a field and
 * reconfigured on every {@link #runSearch(String, int)} call, so concurrent
 * calls on the same runner would presumably interfere with each other.
 */
public class BingWebQueryRunner {
  private static final Logger LOG = Logger
      .getLogger("opennlp.tools.similarity.apps.BingWebQueryRunner");

  // NOTE(review): a credential committed to source is effectively public.
  // It is kept here because it is part of the class's public interface, but
  // it should be rotated and supplied through configuration instead.
  public static final String BING_KEY = "TyfmF/4t1qbnA5X6sBXiTf80l29cSn+7IT0fPw2FNsU=";

  /** Title similarity above which two search hits count as duplicates. */
  private static final double DUPLICATE_TITLE_THRESHOLD = 0.9;

  private AzureSearchWebQuery aq = new AzureSearchWebQuery();

  /**
   * Runs {@code query} and returns the hits with near-duplicate titles
   * removed.
   *
   * @param query
   *          search expression, handed to the query object as-is
   * @param nRes
   *          requested number of results; currently not applied to the query
   *          and retained only so existing callers keep compiling
   * @return the de-duplicated hits in the order they were returned; an empty
   *         list when the query yields no result set
   */
  public List<HitBase> runSearch(String query, int nRes) {
    aq.setAppid(BING_KEY);
    aq.setQuery(query);
    aq.doQuery();

    List<HitBase> results = new ArrayList<HitBase>();
    AzureSearchResultSet<AzureSearchWebResult> ars = aq.getQueryResult();
    if (ars == null) {
      // No result set available: report "no hits" rather than failing with
      // a NullPointerException in the loop below.
      return results;
    }

    for (AzureSearchWebResult anr : ars) {
      HitBase h = new HitBase();
      h.setAbstractText(anr.getDescription());
      h.setTitle(anr.getTitle());
      h.setUrl(anr.getUrl());
      results.add(h);
    }

    // De-duplicate once over the complete list; doing it after every add
    // repeats the all-pairs comparison for each incoming hit.
    return removeDuplicates(results, DUPLICATE_TITLE_THRESHOLD);
  }

  /**
   * Returns a new list containing {@code hits} without entries whose title
   * is too similar to the title of an earlier entry.
   *
   * <p>Hits with an empty or {@code null} title are never compared and are
   * always kept. Of two similar hits, the later one is dropped.
   *
   * @param hits
   *          hits to filter; {@code null} is treated as an empty list
   * @param imageDupeThresh
   *          similarity score (as produced by
   *          {@link StringDistanceMeasurer#measureStringDistance}) above
   *          which two titles are considered duplicates
   * @return a new list with the duplicates removed; if the comparison fails
   *         unexpectedly, a copy of {@code hits} with nothing removed
   */
  public static List<HitBase> removeDuplicates(List<HitBase> hits,
      double imageDupeThresh) {
    StringDistanceMeasurer meas = new StringDistanceMeasurer();
    List<HitBase> hitsDedup = new ArrayList<HitBase>();
    if (hits == null || hits.isEmpty()) {
      return hitsDedup;
    }

    try {
      int size = hits.size();
      // duplicate[k] is true when hit k repeats the title of an earlier hit
      boolean[] duplicate = new boolean[size];
      // first index that was flagged, reported in the log message below
      int firstDuplicate = -1;

      for (int i = 0; i < size; i++) {
        for (int j = i + 1; j < size; j++) {
          String title1 = hits.get(i).getTitle();
          String title2 = hits.get(j).getTitle();
          if (StringUtils.isEmpty(title1) || StringUtils.isEmpty(title2)) {
            continue;
          }
          if (meas.measureStringDistance(title1, title2) > imageDupeThresh) {
            // dupes found, later list member to be deleted
            duplicate[j] = true;
            if (firstDuplicate < 0) {
              firstDuplicate = j;
            }
          }
        }
      }

      for (int i = 0; i < size; i++) {
        if (!duplicate[i]) {
          hitsDedup.add(hits.get(i));
        }
      }

      if (firstDuplicate >= 0) {
        LOG.info("Removed duplicates from relevant search results, including "
            + hits.get(firstDuplicate).getTitle());
      }
      return hitsDedup;
    } catch (Exception e) {
      // Best effort: a failure while scoring titles must not discard every
      // search hit, so fall back to the unfiltered input.
      LOG.log(Level.SEVERE, "Problem removing duplicates from relevant images",
          e);
      return new ArrayList<HitBase>(hits);
    }
  }

  /**
   * Presumably meant to report how many pages are indexed for {@code site}.
   *
   * <p>The current implementation only issues the {@code site:} query; no
   * total is read from the response, so the method always returns 0.
   *
   * @param site
   *          site (host name) to query
   * @return always 0
   */
  public int getTotalPagesAtASite(String site) {
    try {
      // Result intentionally discarded: no total is extracted from it yet.
      runSearch("site:" + site, 10);
    } catch (Exception e) {
      LOG.info("No news search results for query = 'site:" + site + "': " + e);
    }
    return 0;
  }
}