geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java - opennlp-addons - Git at Google

 /*
  * Copyright 2013 The Apache Software Foundation.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package opennlp.addons.geoentitylinker;

 import java.io.BufferedOutputStream;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.FileWriter;
 import java.io.IOException;
 import java.io.OutputStream;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Set;
 import opennlp.tools.doccat.DoccatModel;
 import opennlp.tools.doccat.DocumentCategorizerME;
 import opennlp.tools.doccat.DocumentSample;
 import opennlp.tools.doccat.DocumentSampleStream;
 import opennlp.tools.entitylinker.EntityLinkerProperties;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.PlainTextByLineStream;

 import opennlp.tools.cmdline.MarkableFileInputStreamFactory;


 /**
  * Tools for setting up the GeoEntityLinker: building the gazetteer Lucene
  * indexes and training the doccat model used for country-context scoring.
  */
 public class GeoEntityLinkerSetupUtils {
   /**
    * Radius handed to {@code scorer.getTextChunk} when extracting the text
    * around a country mention. NOTE(review): presumably a character count;
    * confirm against ModelBasedScorer.
    */
   private static final int RADIUS = 200;
   /** Scorer used to extract the text chunk surrounding each country mention. */
   public static ModelBasedScorer scorer;

   static {
     scorer = new ModelBasedScorer();
   }

   /**
    * Generates the lucene indexes of the USGS and GEONAMES gazetteers.
    * <p>
    * Any exception thrown by the indexer is caught and its stack trace is
    * printed; it is not propagated to the caller.
    *
    * @param outputIndexDir    the destination directory of the index. Must be a
    *                          directory
    * @param gazateerInputData the input data file. Must be in geonames gaz
    *                          format, or USGS format
    * @param type              the type, USGS, or GEONAMES
    */
   public static void createLuceneIndex(File outputIndexDir, File gazateerInputData, GazateerIndexer.GazType type) {
     GazateerIndexer indexer = new GazateerIndexer();
     try {
       indexer.index(outputIndexDir, gazateerInputData, type);
     } catch (Exception ex) {
       ex.printStackTrace();
     }
   }

   /**
    * Generates a doccat model from proximal features generated from the
    * surrounding context of country mentions. This model is used as a basis
    * for a score called countrymodel, which takes the context from around a
    * toponym, and uses this model to return a score for the country code of
    * the toponym hit in the gazetteer.
    * <p>
    * The method first appends one training line per extracted word bag
    * ({@code "<countrycode> <wordbag>"}) to {@code annotationOutFile}, then
    * trains a doccat model from that file and serializes it to
    * {@code modelOutFile}. The annotation file is written as UTF-8 because it
    * is read back as UTF-8 for training.
    * <p>
    * NOTE(review): a word bag that contains line breaks would span several
    * lines of the training file; whether {@code getTextChunk} can return such
    * text is not visible here, so confirm before relying on the output format.
    *
    * @param documents         A list of document texts, for best results try to
    *                          ensure each country you care about will be well
    *                          represented in the collection
    * @param annotationOutFile the location where the annotated doccat text file
    *                          will be stored; new lines are appended to any
    *                          existing content
    * @param modelOutFile      the location where the doccat model will be stored
    * @param properties        the properties from which the country context
    *                          object will find its country data, using this
    *                          property:
    *                          opennlp.geoentitylinker.countrycontext.filepath
    * @throws Exception if the country context cannot be created or the
    *                   annotation file cannot be written. An IOException
    *                   raised while training or serializing the model is
    *                   caught and printed instead of being propagated.
    */
   public static void buildCountryContextModel(Collection<String> documents, File annotationOutFile, File modelOutFile, EntityLinkerProperties properties) throws Exception {
     CountryContext context = new CountryContext(properties);
     System.out.println("processing " + documents.size() + " documents");
     // try-with-resources guarantees the annotation file is closed even if a
     // document fails; UTF-8 matches the charset used to read the file below.
     try (java.io.Writer writer = new java.io.OutputStreamWriter(
         new FileOutputStream(annotationOutFile, true), java.nio.charset.StandardCharsets.UTF_8)) {
       for (String docText : documents) {
         System.out.append(".");
         // The return value is not needed here. NOTE(review): this call
         // presumably populates the mentions later read through
         // getCountryMentions(); confirm against CountryContext.
         context.regexfind(docText);
         Map<String, ArrayList<String>> bagsByCountry = modelCountryContext(docText, context, RADIUS);
         for (Map.Entry<String, ArrayList<String>> entry : bagsByCountry.entrySet()) {
           for (String wordbag : entry.getValue()) {
             writer.write(entry.getKey() + " " + wordbag + "\n");
           }
         }
       }
     }
     System.out.println("Document processing complete. Writing training data to " + annotationOutFile.getAbsolutePath());
     System.out.println("Building Doccat model...");

     try {
       ObjectStream<String> lineStream = new PlainTextByLineStream(new MarkableFileInputStreamFactory(annotationOutFile), "UTF-8");
       ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream);
       DoccatModel model;
       try {
         model = DocumentCategorizerME.train("en", sampleStream);
       } finally {
         // Release the training file handle; closing the sample stream is
         // expected to close the wrapped line stream as well.
         sampleStream.close();
       }
       try (OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelOutFile))) {
         model.serialize(modelOut);
       }
       System.out.println("Model complete!");
     } catch (IOException e) {
       // Failed to read or parse training data, or to write the model;
       // training failed
       e.printStackTrace();
     }
   }

   /**
    * Generates proximal wordbags within the radius of a country mention within
    * the doctext, based on the mentions held by the country context object.
    *
    * @param docText           the document text the mentions refer to
    * @param additionalContext the country context whose country mentions (a
    *                          map of country code to mention indexes) are used
    * @param radius            the radius passed to the scorer when extracting
    *                          the text chunk around each mention
    * @return a map of country code to the text chunks collected around each of
    *         its mentions; codes without mentions have no entry
    */
   private static Map<String, ArrayList<String>> modelCountryContext(String docText, CountryContext additionalContext, int radius) {
     Map<String, ArrayList<String>> featureBags = new HashMap<>();
     Map<String, Set<Integer>> countryMentions = additionalContext.getCountryMentions();
     // Walk every country code together with all of its mentions.
     for (Map.Entry<String, Set<Integer>> mentions : countryMentions.entrySet()) {
       String code = mentions.getKey();
       // Collect the chunk around each mention and group the chunks by code.
       for (int mentionIdx : mentions.getValue()) {
         String chunk = scorer.getTextChunk(mentionIdx, docText, radius);
         ArrayList<String> bag = featureBags.get(code);
         if (bag == null) {
           bag = new ArrayList<>();
           featureBags.put(code, bag);
         }
         bag.add(chunk);
       }
     }
     return featureBags;
   }
 }
	/*
	* Copyright 2013 The Apache Software Foundation.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package opennlp.addons.geoentitylinker;

	import java.io.BufferedOutputStream;
	import java.io.File;
	import java.io.FileOutputStream;
	import java.io.FileWriter;
	import java.io.IOException;
	import java.io.OutputStream;
	import java.util.ArrayList;
	import java.util.Collection;
	import java.util.HashMap;
	import java.util.Map;
	import java.util.Set;
	import opennlp.tools.doccat.DoccatModel;
	import opennlp.tools.doccat.DocumentCategorizerME;
	import opennlp.tools.doccat.DocumentSample;
	import opennlp.tools.doccat.DocumentSampleStream;
	import opennlp.tools.entitylinker.EntityLinkerProperties;
	import opennlp.tools.util.ObjectStream;
	import opennlp.tools.util.PlainTextByLineStream;

	import opennlp.tools.cmdline.MarkableFileInputStreamFactory;


	/**
	 * Tools for setting up the GeoEntityLinker: building the gazetteer Lucene
	 * indexes and training the doccat model used for country-context scoring.
	 */
	public class GeoEntityLinkerSetupUtils {
	  /**
	   * Radius handed to {@code scorer.getTextChunk} when extracting the text
	   * around a country mention. NOTE(review): presumably a character count;
	   * confirm against ModelBasedScorer.
	   */
	  private static final int RADIUS = 200;
	  /** Scorer used to extract the text chunk surrounding each country mention. */
	  public static ModelBasedScorer scorer;

	  static {
	    scorer = new ModelBasedScorer();
	  }

	  /**
	   * Generates the lucene indexes of the USGS and GEONAMES gazetteers.
	   * <p>
	   * Any exception thrown by the indexer is caught and its stack trace is
	   * printed; it is not propagated to the caller.
	   *
	   * @param outputIndexDir    the destination directory of the index. Must be a
	   *                          directory
	   * @param gazateerInputData the input data file. Must be in geonames gaz
	   *                          format, or USGS format
	   * @param type              the type, USGS, or GEONAMES
	   */
	  public static void createLuceneIndex(File outputIndexDir, File gazateerInputData, GazateerIndexer.GazType type) {
	    GazateerIndexer indexer = new GazateerIndexer();
	    try {
	      indexer.index(outputIndexDir, gazateerInputData, type);
	    } catch (Exception ex) {
	      ex.printStackTrace();
	    }
	  }

	  /**
	   * Generates a doccat model from proximal features generated from the
	   * surrounding context of country mentions. This model is used as a basis
	   * for a score called countrymodel, which takes the context from around a
	   * toponym, and uses this model to return a score for the country code of
	   * the toponym hit in the gazetteer.
	   * <p>
	   * The method first appends one training line per extracted word bag
	   * ({@code "<countrycode> <wordbag>"}) to {@code annotationOutFile}, then
	   * trains a doccat model from that file and serializes it to
	   * {@code modelOutFile}. The annotation file is written as UTF-8 because it
	   * is read back as UTF-8 for training.
	   * <p>
	   * NOTE(review): a word bag that contains line breaks would span several
	   * lines of the training file; whether {@code getTextChunk} can return such
	   * text is not visible here, so confirm before relying on the output format.
	   *
	   * @param documents         A list of document texts, for best results try to
	   *                          ensure each country you care about will be well
	   *                          represented in the collection
	   * @param annotationOutFile the location where the annotated doccat text file
	   *                          will be stored; new lines are appended to any
	   *                          existing content
	   * @param modelOutFile      the location where the doccat model will be stored
	   * @param properties        the properties from which the country context
	   *                          object will find its country data, using this
	   *                          property:
	   *                          opennlp.geoentitylinker.countrycontext.filepath
	   * @throws Exception if the country context cannot be created or the
	   *                   annotation file cannot be written. An IOException
	   *                   raised while training or serializing the model is
	   *                   caught and printed instead of being propagated.
	   */
	  public static void buildCountryContextModel(Collection<String> documents, File annotationOutFile, File modelOutFile, EntityLinkerProperties properties) throws Exception {
	    CountryContext context = new CountryContext(properties);
	    System.out.println("processing " + documents.size() + " documents");
	    // try-with-resources guarantees the annotation file is closed even if a
	    // document fails; UTF-8 matches the charset used to read the file below.
	    try (java.io.Writer writer = new java.io.OutputStreamWriter(
	        new FileOutputStream(annotationOutFile, true), java.nio.charset.StandardCharsets.UTF_8)) {
	      for (String docText : documents) {
	        System.out.append(".");
	        // The return value is not needed here. NOTE(review): this call
	        // presumably populates the mentions later read through
	        // getCountryMentions(); confirm against CountryContext.
	        context.regexfind(docText);
	        Map<String, ArrayList<String>> bagsByCountry = modelCountryContext(docText, context, RADIUS);
	        for (Map.Entry<String, ArrayList<String>> entry : bagsByCountry.entrySet()) {
	          for (String wordbag : entry.getValue()) {
	            writer.write(entry.getKey() + " " + wordbag + "\n");
	          }
	        }
	      }
	    }
	    System.out.println("Document processing complete. Writing training data to " + annotationOutFile.getAbsolutePath());
	    System.out.println("Building Doccat model...");

	    try {
	      ObjectStream<String> lineStream = new PlainTextByLineStream(new MarkableFileInputStreamFactory(annotationOutFile), "UTF-8");
	      ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream);
	      DoccatModel model;
	      try {
	        model = DocumentCategorizerME.train("en", sampleStream);
	      } finally {
	        // Release the training file handle; closing the sample stream is
	        // expected to close the wrapped line stream as well.
	        sampleStream.close();
	      }
	      try (OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelOutFile))) {
	        model.serialize(modelOut);
	      }
	      System.out.println("Model complete!");
	    } catch (IOException e) {
	      // Failed to read or parse training data, or to write the model;
	      // training failed
	      e.printStackTrace();
	    }
	  }

	  /**
	   * Generates proximal wordbags within the radius of a country mention within
	   * the doctext, based on the mentions held by the country context object.
	   *
	   * @param docText           the document text the mentions refer to
	   * @param additionalContext the country context whose country mentions (a
	   *                          map of country code to mention indexes) are used
	   * @param radius            the radius passed to the scorer when extracting
	   *                          the text chunk around each mention
	   * @return a map of country code to the text chunks collected around each of
	   *         its mentions; codes without mentions have no entry
	   */
	  private static Map<String, ArrayList<String>> modelCountryContext(String docText, CountryContext additionalContext, int radius) {
	    Map<String, ArrayList<String>> featureBags = new HashMap<>();
	    Map<String, Set<Integer>> countryMentions = additionalContext.getCountryMentions();
	    // Walk every country code together with all of its mentions.
	    for (Map.Entry<String, Set<Integer>> mentions : countryMentions.entrySet()) {
	      String code = mentions.getKey();
	      // Collect the chunk around each mention and group the chunks by code.
	      for (int mentionIdx : mentions.getValue()) {
	        String chunk = scorer.getTextChunk(mentionIdx, docText, radius);
	        ArrayList<String> bag = featureBags.get(code);
	        if (bag == null) {
	          bag = new ArrayList<>();
	          featureBags.put(code, bag);
	        }
	        bag.add(chunk);
	      }
	    }
	    return featureBags;
	  }
	}