opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java - opennlp-sandbox - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package opennlp.tools.similarity.apps.taxo_builder;

 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.logging.Logger;

 import opennlp.tools.similarity.apps.BingResponse;
 import opennlp.tools.similarity.apps.BingWebQueryRunner;
 import opennlp.tools.similarity.apps.HitBase;
 import opennlp.tools.similarity.apps.utils.StringCleaner;
 import opennlp.tools.stemmer.PorterStemmer;
 import opennlp.tools.textsimilarity.ParseTreeChunk;
 import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;
 import opennlp.tools.textsimilarity.SentencePairMatchResult;
 import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

 /**
  * Extends a manually built taxonomy kernel with words mined from web search
  * results.
  *
  * <p>
  * The input is the kernel read through {@code AriAdapter}: for an entity
  * (for example "tax") it gives all lists of associated words ("taxonomy
  * paths"). For every path a search query is formed, the snapshots (title plus
  * abstract) of the returned hits are matched pairwise, and the words the
  * snapshots have in common form the derived lists. Two maps are produced:
  * <ol>
  * <li>entity to derived word lists
  * ({@link #getLemma_ExtendedAssocWords()});</li>
  * <li>manual word list to derived word lists
  * ({@link #getAssocWords_ExtendedAssocWords()}).</li>
  * </ol>
  */
 public class TaxonomyExtenderViaMebMining extends BingWebQueryRunner {
   private static final Logger LOG = Logger
       .getLogger(TaxonomyExtenderViaMebMining.class.getName());
   // Not referenced by any method of this class; kept so that construction
   // behaves exactly as before.
   private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
   /** Syntactic matcher; stays {@code null} when it could not be loaded. */
   ParserChunker2MatcherProcessor sm;

   private Map<String, List<List<String>>> lemma_ExtendedAssocWords = new HashMap<String, List<List<String>>>();
   private Map<List<String>, List<List<String>>> assocWords_ExtendedAssocWords = new HashMap<List<String>, List<List<String>>>();
   private PorterStemmer ps;

   /**
    * @return the live (not copied) map from a manual taxonomy path to the word
    *         lists derived for it
    */
   public Map<List<String>, List<List<String>>> getAssocWords_ExtendedAssocWords() {
     return assocWords_ExtendedAssocWords;
   }

   /**
    * @return the live (not copied) map from an entity to its derived word lists
    */
   public Map<String, List<List<String>>> getLemma_ExtendedAssocWords() {
     return lemma_ExtendedAssocWords;
   }

   /**
    * Replaces the entity to derived-word-lists map.
    *
    * @param lemma_ExtendedAssocWords
    *          the map to use from now on; stored as is, without copying
    */
   public void setLemma_ExtendedAssocWords(
       Map<String, List<List<String>>> lemma_ExtendedAssocWords) {
     this.lemma_ExtendedAssocWords = lemma_ExtendedAssocWords;
   }

   /**
    * Creates the extender and tries to obtain the shared syntactic matcher. A
    * failure to load the matcher is logged and leaves {@link #sm} unset rather
    * than aborting construction.
    */
   public TaxonomyExtenderViaMebMining() {
     try {
       sm = ParserChunker2MatcherProcessor.getInstance();
     } catch (Exception e) {
       LOG.log(java.util.logging.Level.SEVERE, "Problem loading synt matcher",
           e);
     }
     ps = new PorterStemmer();
   }

   /**
    * Turns match results into de-duplicated word lists.
    *
    * <p>
    * From every group of chunks the method keeps lemmas that are not the
    * {@code "*"} placeholder, are tagged as noun ({@code NN*}) or verb
    * ({@code VB*}), are longer than two characters and whose stem does not
    * start with {@code "invalid"}. Duplicates and the query words are dropped;
    * a non-empty remainder gets {@code toAddAtEnd} appended.
    *
    * @param matchList
    *          groups of chunks produced by pairwise snapshot matching
    * @param queryWordsToRemove
    *          words that must not appear among the derived words
    * @param toAddAtEnd
    *          words appended to every non-empty derived list
    * @return the distinct derived word lists, in no particular order
    */
   private List<List<String>> getCommonWordsFromList_List_ParseTreeChunk(
       List<List<ParseTreeChunk>> matchList, List<String> queryWordsToRemove,
       List<String> toAddAtEnd) {
     List<List<String>> res = new ArrayList<List<String>>();
     for (List<ParseTreeChunk> chunks : matchList) {
       List<String> wordRes = new ArrayList<String>();
       for (ParseTreeChunk ch : chunks) {
         List<String> lemmas = ch.getLemmas();
         for (int w = 0; w < lemmas.size(); w++) {
           String lemma = lemmas.get(w);
           if (lemma.equals("*")) {
             continue;
           }
           // The POS tag is looked up only after the placeholder check, in the
           // same order as the original short-circuit condition.
           String pos = ch.getPOSs().get(w);
           boolean nounOrVerb = pos.startsWith("NN") || pos.startsWith("VB");
           if (!nounOrVerb || lemma.length() <= 2) {
             continue;
           }
           if (!ps.stem(lemma).startsWith("invalid")) {
             wordRes.add(lemma);
           }
         }
       }
       // De-duplicate first, then remove the words the query already had.
       wordRes = new ArrayList<String>(new HashSet<String>(wordRes));
       wordRes.removeAll(queryWordsToRemove);
       if (!wordRes.isEmpty()) {
         wordRes.addAll(toAddAtEnd);
         res.add(wordRes);
       }
     }
     return new ArrayList<List<String>>(new HashSet<List<String>>(res));
   }

   /**
    * Reads the taxonomy kernel from an ARI file, extends every taxonomy path
    * with words mined from search results and writes the result next to the
    * input, with {@code ".ari"} in the file name replaced by {@code "Taxo.dat"}.
    *
    * <p>
    * A failure while processing the paths is logged and stops the mining; what
    * has been collected up to that point is still written.
    *
    * @param fileName
    *          path of the ARI file holding the taxonomy kernel
    * @param domain
    *          domain word added to every query and excluded from the derived
    *          words
    * @param lang
    *          language passed on to the search
    */
   public void extendTaxonomy(String fileName, String domain, String lang) {
     AriAdapter ad = new AriAdapter();
     ad.getChainsFromARIfile(fileName);
     List<String> entries = new ArrayList<String>(ad.lemma_AssocWords.keySet());
     try {
       for (String entity : entries) {
         List<List<String>> paths = ad.lemma_AssocWords.get(entity);
         for (List<String> taxoPath : paths) {
           // todo: query forming function here
           String query = taxoPath.toString() + " " + entity + " " + domain;
           // Strip the List.toString() punctuation and underscores.
           query = query.replace('[', ' ').replace(']', ' ').replace(',', ' ')
               .replace('_', ' ');
           List<List<ParseTreeChunk>> matchList = runSearchForTaxonomyPath(
               query, "", lang, 30);
           List<String> toRemoveFromExtension = new ArrayList<String>(taxoPath);
           toRemoveFromExtension.add(entity);
           toRemoveFromExtension.add(domain);
           List<List<String>> resList = getCommonWordsFromList_List_ParseTreeChunk(
               matchList, toRemoveFromExtension, taxoPath);
           assocWords_ExtendedAssocWords.put(taxoPath, resList);
           // resList is shared by both maps, so the path added here is visible
           // through assocWords_ExtendedAssocWords as well.
           resList.add(taxoPath);
           // NOTE(review): put() replaces the lists stored for an earlier path
           // of the same entity - confirm whether they should be accumulated.
           lemma_ExtendedAssocWords.put(entity, resList);
         }
       }
     } catch (Exception e) {
       LOG.log(java.util.logging.Level.SEVERE, "Problem taxonomy matching", e);
     }

     TaxonomySerializer ser = new TaxonomySerializer(lemma_ExtendedAssocWords,
         assocWords_ExtendedAssocWords);
     ser.writeTaxonomy(fileName.replace(".ari", "Taxo.dat"));
   }

   /**
    * Runs a search and matches the snapshots (title plus abstract) of every
    * pair of returned hits against each other.
    *
    * @param query
    *          search query
    * @param domain
    *          domain passed on to the search
    * @param lang
    *          language passed on to the search
    * @param numbOfHits
    *          number of hits requested from the search
    * @return the match results of all hit pairs concatenated; empty when the
    *         matcher is unavailable or the search gave nothing, and possibly
    *         partial when a failure occurred midway (the failure is logged)
    */
   public List<List<ParseTreeChunk>> runSearchForTaxonomyPath(String query,
       String domain, String lang, int numbOfHits) {
     List<List<ParseTreeChunk>> genResult = new ArrayList<List<ParseTreeChunk>>();
     if (sm == null) {
       // Without a matcher no pair can be assessed, so the search is skipped.
       LOG.warning("Syntactic matcher is not loaded; no search run for: "
           + query);
       return genResult;
     }
     try {
       List<String> resultList = search(query, domain, lang, numbOfHits);
       if (resultList == null || resultList.isEmpty()) {
         return genResult;
       }

       BingResponse resp = populateBingHit(resultList.get(0));
       List<HitBase> hits = resp.getHits();
       for (int i = 0; i < hits.size() - 1; i++) {
         // The first snapshot is the same for every partner, so clean it once.
         HitBase h1 = hits.get(i);
         String snapshot1 = StringCleaner.processSnapshotForMatching(h1
             .getTitle() + " . " + h1.getAbstractText());
         for (int j = i + 1; j < hits.size(); j++) {
           HitBase h2 = hits.get(j);
           String snapshot2 = StringCleaner.processSnapshotForMatching(h2
               .getTitle() + " . " + h2.getAbstractText());
           SentencePairMatchResult matchRes = sm.assessRelevance(snapshot1,
               snapshot2);
           genResult.addAll(matchRes.getMatchResult());
         }
       }
     } catch (Exception e) {
       LOG.log(java.util.logging.Level.WARNING,
           "Problem extracting taxonomy node", e);
     }

     return genResult;
   }

   /**
    * Closes the syntactic matcher; does nothing when it was never loaded.
    */
   public void close() {
     if (sm != null) {
       sm.close();
     }
   }

   /**
    * Extends the sample IRS taxonomy for the "tax" domain in English.
    *
    * @param args
    *          ignored
    */
   public static void main(String[] args) {
     TaxonomyExtenderViaMebMining self = new TaxonomyExtenderViaMebMining();
     try {
       self.extendTaxonomy("src/test/resources/taxonomies/irs_dom.ari", "tax",
           "en");
     } finally {
       self.close();
     }
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package opennlp.tools.similarity.apps.taxo_builder;

	import java.util.ArrayList;
	import java.util.HashMap;
	import java.util.HashSet;
	import java.util.List;
	import java.util.Map;
	import java.util.logging.Logger;

	import opennlp.tools.similarity.apps.BingResponse;
	import opennlp.tools.similarity.apps.BingWebQueryRunner;
	import opennlp.tools.similarity.apps.HitBase;
	import opennlp.tools.similarity.apps.utils.StringCleaner;
	import opennlp.tools.stemmer.PorterStemmer;
	import opennlp.tools.textsimilarity.ParseTreeChunk;
	import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;
	import opennlp.tools.textsimilarity.SentencePairMatchResult;
	import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

	/**
	* Results of taxonomy learning are two maps 0) For an entity like tax it gives
	* all lists of associated parameters obtained from the taxonomy kernel (done
	* manually) Now, given 0, we obtain the derived list of parameters as
	* commonalities of search results snapshots output map 1) for the entity,
	* derived list output map 2) for such manual list of words -> derived list of
	* words
	*
	*
	*/

	public class TaxonomyExtenderViaMebMining extends BingWebQueryRunner {
	private static Logger LOG = Logger
	.getLogger("opennlp.tools.similarity.apps.taxo_builder.TaxonomyExtenderSearchResultFromYahoo");
	private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
	ParserChunker2MatcherProcessor sm;

	private Map<String, List<List<String>>> lemma_ExtendedAssocWords = new HashMap<String, List<List<String>>>();
	private Map<List<String>, List<List<String>>> assocWords_ExtendedAssocWords = new HashMap<List<String>, List<List<String>>>();
	private PorterStemmer ps;

	public Map<List<String>, List<List<String>>> getAssocWords_ExtendedAssocWords() {
	return assocWords_ExtendedAssocWords;
	}

	public Map<String, List<List<String>>> getLemma_ExtendedAssocWords() {
	return lemma_ExtendedAssocWords;
	}

	public void setLemma_ExtendedAssocWords(
	Map<String, List<List<String>>> lemma_ExtendedAssocWords) {
	this.lemma_ExtendedAssocWords = lemma_ExtendedAssocWords;
	}

	public TaxonomyExtenderViaMebMining() {
	try {
	sm = ParserChunker2MatcherProcessor.getInstance();
	} catch (Exception e) { // now try 'local' openNLP
	System.err.println("Problem loading synt matcher");

	}
	ps = new PorterStemmer();

	}

	private List<List<String>> getCommonWordsFromList_List_ParseTreeChunk(
	List<List<ParseTreeChunk>> matchList, List<String> queryWordsToRemove,
	List<String> toAddAtEnd) {
	List<List<String>> res = new ArrayList<List<String>>();
	for (List<ParseTreeChunk> chunks : matchList) {
	List<String> wordRes = new ArrayList<String>();
	for (ParseTreeChunk ch : chunks) {
	List<String> lemmas = ch.getLemmas();
	for (int w = 0; w < lemmas.size(); w++)
	if ((!lemmas.get(w).equals("*"))
	&& ((ch.getPOSs().get(w).startsWith("NN") \|\| ch.getPOSs().get(w)
	.startsWith("VB"))) && lemmas.get(w).length() > 2) {
	String formedWord = lemmas.get(w);
	String stemmedFormedWord = ps.stem(formedWord);
	if (!stemmedFormedWord.startsWith("invalid"))
	wordRes.add(formedWord);
	}
	}
	wordRes = new ArrayList<String>(new HashSet<String>(wordRes));
	wordRes.removeAll(queryWordsToRemove);
	if (wordRes.size() > 0) {
	wordRes.addAll(toAddAtEnd);
	res.add(wordRes);
	}
	}
	res = new ArrayList<List<String>>(new HashSet<List<String>>(res));
	return res;
	}

	public void extendTaxonomy(String fileName, String domain, String lang) {
	AriAdapter ad = new AriAdapter();
	ad.getChainsFromARIfile(fileName);
	List<String> entries = new ArrayList<String>((ad.lemma_AssocWords.keySet()));
	try {
	for (String entity : entries) { // .
	List<List<String>> paths = ad.lemma_AssocWords.get(entity);
	for (List<String> taxoPath : paths) {
	String query = taxoPath.toString() + " " + entity + " " + domain; // todo:
	// query
	// forming
	// function
	// here
	query = query.replace('[', ' ').replace(']', ' ').replace(',', ' ')
	.replace('_', ' ');
	List<List<ParseTreeChunk>> matchList = runSearchForTaxonomyPath(
	query, "", lang, 30);
	List<String> toRemoveFromExtension = new ArrayList<String>(taxoPath);
	toRemoveFromExtension.add(entity);
	toRemoveFromExtension.add(domain);
	List<List<String>> resList = getCommonWordsFromList_List_ParseTreeChunk(
	matchList, toRemoveFromExtension, taxoPath);
	assocWords_ExtendedAssocWords.put(taxoPath, resList);
	resList.add(taxoPath);
	lemma_ExtendedAssocWords.put(entity, resList);
	}
	}
	} catch (Exception e) {
	System.err.println("Problem taxonomy matching");
	}

	TaxonomySerializer ser = new TaxonomySerializer(lemma_ExtendedAssocWords,
	assocWords_ExtendedAssocWords);
	ser.writeTaxonomy(fileName.replace(".ari", "Taxo.dat"));
	}

	public List<List<ParseTreeChunk>> runSearchForTaxonomyPath(String query,
	String domain, String lang, int numbOfHits) {
	List<List<ParseTreeChunk>> genResult = new ArrayList<List<ParseTreeChunk>>();
	try {
	List<String> resultList = search(query, domain, lang, numbOfHits);

	BingResponse resp = populateBingHit(resultList.get(0));
	// printSearchResult(resultList.get(0));
	for (int i = 0; i < resp.getHits().size(); i++) {
	{
	for (int j = i + 1; j < resp.getHits().size(); j++) {
	HitBase h1 = resp.getHits().get(i);
	HitBase h2 = resp.getHits().get(j);
	String snapshot1 = StringCleaner.processSnapshotForMatching(h1
	.getTitle() + " . " + h1.getAbstractText());
	String snapshot2 = StringCleaner.processSnapshotForMatching(h2
	.getTitle() + " . " + h2.getAbstractText());
	SentencePairMatchResult matchRes = sm.assessRelevance(snapshot1,
	snapshot2);
	List<List<ParseTreeChunk>> matchResult = matchRes.getMatchResult();
	genResult.addAll(matchResult);
	}
	}
	}

	} catch (Exception e) {
	System.err.print("Problem extracting taxonomy node");
	}

	return genResult;
	}

	public void close() {
	sm.close();

	}

	public static void main(String[] args) {
	TaxonomyExtenderViaMebMining self = new TaxonomyExtenderViaMebMining();
	self.extendTaxonomy("src/test/resources/taxonomies/irs_dom.ari", "tax",
	"en");

	}

	}