/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package opennlp.tools.similarity.apps.taxo_builder;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.logging.Logger;

import org.apache.commons.lang.StringUtils;

import opennlp.tools.parse_thicket.matching.Matcher;
import opennlp.tools.similarity.apps.BingQueryRunner;
import opennlp.tools.similarity.apps.HitBase;
import opennlp.tools.similarity.apps.utils.StringCleaner;
import opennlp.tools.stemmer.PStemmer;
import opennlp.tools.textsimilarity.ParseTreeChunk;
import opennlp.tools.textsimilarity.TextProcessor;

/**
 * Extends a seed domain taxonomy, imported from CSV data by {@link CsvAdapter},
 * by running a web search for each taxonomy path and mining the result
 * snippets for nouns, adjectives, and verbs that pairs of hits share. The
 * extended taxonomy is persisted via {@link TaxonomySerializer}.
 *
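 * A minimal usage sketch (assuming the CSV seed data and the Bing search key
 * are available, as wired up in the no-argument constructor):
 * <pre>{@code
 *   DomainTaxonomyExtender extender = new DomainTaxonomyExtender();
 *   extender.extendTaxonomy("", "music", "en");
 *   Map<String, List<List<String>>> extended = extender.getLemma_ExtendedAssocWords();
 * }</pre>
 */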
public class DomainTaxonomyExtender {
  private static final Logger LOG = Logger
      .getLogger("opennlp.tools.similarity.apps.taxo_builder.DomainTaxonomyExtender");

  private final BingQueryRunner brunner = new BingQueryRunner();
  // NOTE: a hard-coded search key; in a real deployment this credential
  // should come from configuration rather than source code.
  protected static final String BING_KEY = "WFoNMM706MMJ5JYfcHaSEDP+faHj3xAxt28CPljUAHA";
  final Matcher matcher = new Matcher();
  private static final String TAXO_FILENAME = "taxo_data.dat";

  // Taxonomy entry (lemma) -> extended association-word paths.
  private Map<String, List<List<String>>> lemma_ExtendedAssocWords = new HashMap<>();
  // Seed association-word path -> extended association-word paths.
  private final Map<List<String>, List<List<String>>> assocWords_ExtendedAssocWords = new HashMap<>();
  private final PStemmer ps;
  final CsvAdapter adapter = new CsvAdapter();

  public Map<List<String>, List<List<String>>> getAssocWords_ExtendedAssocWords() {
    return assocWords_ExtendedAssocWords;
  }

  public Map<String, List<List<String>>> getLemma_ExtendedAssocWords() {
    return lemma_ExtendedAssocWords;
  }

  public void setLemma_ExtendedAssocWords(
      Map<String, List<List<String>>> lemma_ExtendedAssocWords) {
    this.lemma_ExtendedAssocWords = lemma_ExtendedAssocWords;
  }
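
  /**
   * Creates the extender: initializes the stemmer, imports the seed taxonomy
   * data via {@link CsvAdapter#importCSV()}, and registers the search key
   * with the query runner.
   */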
  public DomainTaxonomyExtender() {
    ps = new PStemmer();
    adapter.importCSV();
    brunner.setKey(BING_KEY);
  }
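
  /**
   * Collects, for each group of matched parse-tree chunks, the lemmas worth
   * keeping as taxonomy extensions: nouns, adjectives, and verbs longer than
   * two characters whose stems are valid, minus the words that already occur
   * in the query.
   *
   * @param matchList          groups of chunks matched between two snippets
   * @param queryWordsToRemove words of the original query, filtered out of the result
   * @param toAddAtEnd         currently unused
   * @return deduplicated lists of candidate extension words
   */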
  private List<List<String>> getCommonWordsFromList_List_ParseTreeChunk(
      List<List<ParseTreeChunk>> matchList, List<String> queryWordsToRemove,
      List<String> toAddAtEnd) {
    List<List<String>> res = new ArrayList<>();
    for (List<ParseTreeChunk> chunks : matchList) {
      List<String> wordRes = new ArrayList<>();
      for (ParseTreeChunk ch : chunks) {
        List<String> lemmas = ch.getLemmas();
        for (int w = 0; w < lemmas.size(); w++) {
          // Keep only content words: nouns, adjectives, and verbs longer than two characters.
          if (!lemmas.get(w).equals("*")
              && (ch.getPOSs().get(w).startsWith("NN")
                  || ch.getPOSs().get(w).startsWith("JJ")
                  || ch.getPOSs().get(w).startsWith("VB"))
              && lemmas.get(w).length() > 2) {
            String formedWord = lemmas.get(w);
            // Discard words the stemmer cannot handle.
            String stemmedFormedWord = ps.stem(formedWord);
            if (!stemmedFormedWord.startsWith("invalid")) {
              wordRes.add(formedWord);
            }
          }
        }
      }
      // Deduplicate within the group, then drop words already present in the query.
      wordRes = new ArrayList<>(new HashSet<>(wordRes));
      List<String> cleanedRes = new ArrayList<>();
      for (String s : wordRes) {
        if (!queryWordsToRemove.contains(s)) {
          cleanedRes.add(s);
        }
      }
      if (cleanedRes.size() > 0) {
        res.add(cleanedRes);
      }
    }
    // Deduplicate across groups.
    res = new ArrayList<>(new HashSet<>(res));
    return res;
  }
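
  /**
   * Extends the imported seed taxonomy. For every entry and each of its
   * association-word paths, a search query of the form
   * "[path words] entity domain" is issued; words shared between pairs of
   * result snippets become the extended association words for that path.
   * The taxonomy is re-serialized after each path, so intermediate results
   * survive an interruption.
   *
   * @param fileNameWithTaxonomyRoot currently unused; the seed taxonomy comes from {@link CsvAdapter}
   * @param domain domain keyword appended to every search query
   * @param lang   language hint (currently unused downstream)
   */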
  public void extendTaxonomy(String fileNameWithTaxonomyRoot, String domain, String lang) {
    List<String> entries = new ArrayList<>(adapter.lemma_AssocWords.keySet());
    try {
      for (String entity : entries) {
        List<List<String>> paths = adapter.lemma_AssocWords.get(entity);
        for (List<String> taxoPath : paths) {
          // Build a flat keyword query from the taxonomy path, the entity, and the domain.
          String query = taxoPath.toString() + " " + entity + " " + domain;
          query = query.replace('[', ' ').replace(']', ' ').replace(',', ' ')
              .replace('_', ' ');
          List<List<ParseTreeChunk>> matchList = runSearchForTaxonomyPath(
              query, "", lang, 20);
          // The query's own words must not re-appear as extensions.
          List<String> toRemoveFromExtension = new ArrayList<>(taxoPath);
          toRemoveFromExtension.add(entity);
          toRemoveFromExtension.add(domain);
          List<List<String>> resList = getCommonWordsFromList_List_ParseTreeChunk(
              matchList, toRemoveFromExtension, taxoPath);
          assocWords_ExtendedAssocWords.put(taxoPath, resList);
          resList.add(taxoPath);
          lemma_ExtendedAssocWords.put(entity, resList);
          // Persist after each path so partial results are not lost on failure.
          TaxonomySerializer ser = new TaxonomySerializer(lemma_ExtendedAssocWords,
              assocWords_ExtendedAssocWords);
          ser.writeTaxonomy(TAXO_FILENAME);
        }
      }
    } catch (Exception e) {
      LOG.severe("Problem matching taxonomy: " + e);
    }
  }
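
  /**
   * Runs a web search for the query and matches every pair of result snippets
   * (title plus abstract) against each other with {@link Matcher}, accumulating
   * the syntactic overlaps.
   *
   * @param query      the search query
   * @param domain     currently unused
   * @param lang       currently unused
   * @param numbOfHits number of search hits to request
   * @return parse-tree chunk groups common to pairs of search results
   */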
  public List<List<ParseTreeChunk>> runSearchForTaxonomyPath(String query,
      String domain, String lang, int numbOfHits) {
    List<List<ParseTreeChunk>> genResult = new ArrayList<>();
    try {
      List<HitBase> resultList = brunner.runSearch(query, numbOfHits);
      // Compare each pair of hits; whatever two snippets share is likely
      // characteristic of the query rather than of an individual page.
      for (int i = 0; i < resultList.size(); i++) {
        for (int j = i + 1; j < resultList.size(); j++) {
          HitBase h1 = resultList.get(i);
          HitBase h2 = resultList.get(j);
          String snapshot1 = StringCleaner.processSnapshotForMatching(
              h1.getTitle() + " " + h1.getAbstractText());
          String snapshot2 = StringCleaner.processSnapshotForMatching(
              h2.getTitle() + " " + h2.getAbstractText());
          List<List<ParseTreeChunk>> overlaps = matcher.assessRelevance(snapshot1, snapshot2);
          genResult.addAll(overlaps);
        }
      }
    } catch (Exception e) {
      LOG.severe("Problem searching for " + query + ": " + e);
    }
    return genResult;
  }
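
  /**
   * Flat-keyword variant of {@link #runSearchForTaxonomyPath(String, String, String, int)}:
   * instead of matching parse trees, it simply intersects the token sets of
   * each pair of result snippets.
   *
   * @return tokens shared by pairs of search results
   */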
  public List<String> runSearchForTaxonomyPathFlatten(String query,
      String domain, String lang, int numbOfHits) {
    List<String> genResult = new ArrayList<>();
    try {
      List<HitBase> resultList = brunner.runSearch(query, numbOfHits);
      for (int i = 0; i < resultList.size(); i++) {
        for (int j = i + 1; j < resultList.size(); j++) {
          HitBase h1 = resultList.get(i);
          HitBase h2 = resultList.get(j);
          String snapshot1 = StringCleaner.processSnapshotForMatching(
              h1.getTitle() + " " + h1.getAbstractText());
          String snapshot2 = StringCleaner.processSnapshotForMatching(
              h2.getTitle() + " " + h2.getAbstractText());
          List<String> overlaps = assessKeywordOverlap(snapshot1, snapshot2);
          genResult.addAll(overlaps);
        }
      }
    } catch (Exception e) {
      LOG.severe("Problem searching for " + query + ": " + e);
    }
    return genResult;
  }
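
  /**
   * Returns the purely alphabetic tokens of length four or more that the two
   * snippets have in common.
   */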
  private List<String> assessKeywordOverlap(String snapshot1, String snapshot2) {
    List<String> results = new ArrayList<>();
    List<String> firstList = TextProcessor.fastTokenize(snapshot1, false);
    List<String> secondList = TextProcessor.fastTokenize(snapshot2, false);
    // Intersect the token lists, then keep only longer, purely alphabetic tokens.
    firstList.retainAll(secondList);
    for (String s : firstList) {
      if (s.length() < 4) {
        continue;
      }
      if (!StringUtils.isAlpha(s)) {
        continue;
      }
      results.add(s);
    }
    return results;
  }
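
  /**
   * Command-line entry point: extends the seed taxonomy for the "music"
   * domain in English.
   */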
  public static void main(String[] args) {
    DomainTaxonomyExtender self = new DomainTaxonomyExtender();
    self.extendTaxonomy("", "music", "en");
  }
}