/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker; | |
import java.io.BufferedOutputStream; | |
import java.io.File; | |
import java.io.FileInputStream; | |
import java.io.FileNotFoundException; | |
import java.io.FileOutputStream; | |
import java.io.FileWriter; | |
import java.io.IOException; | |
import java.io.InputStream; | |
import java.io.OutputStream; | |
import java.util.ArrayList; | |
import java.util.Collection; | |
import java.util.HashMap; | |
import java.util.List; | |
import java.util.Map; | |
import java.util.Set; | |
import java.util.logging.Level; | |
import java.util.logging.Logger; | |
import opennlp.tools.doccat.BagOfWordsFeatureGenerator; | |
import opennlp.tools.doccat.DoccatModel; | |
import opennlp.tools.doccat.DocumentCategorizerME; | |
import opennlp.tools.doccat.DocumentSample; | |
import opennlp.tools.doccat.DocumentSampleStream; | |
import opennlp.tools.entitylinker.EntityLinkerProperties; | |
import opennlp.tools.entitylinker.domain.BaseLink; | |
import opennlp.tools.entitylinker.domain.LinkedSpan; | |
import opennlp.tools.util.ObjectStream; | |
import opennlp.tools.util.PlainTextByLineStream; | |
import opennlp.tools.util.Span; | |
/** | |
* | |
* @author Owner | |
*/ | |
public class ModelBasedScorer implements LinkedEntityScorer<CountryContext> { | |
public static ModelBasedScorer scorer; | |
static { | |
scorer = new ModelBasedScorer(); | |
} | |
DocumentCategorizerME documentCategorizerME; | |
DoccatModel doccatModel; | |
public static final int RADIUS = 100; | |
@Override | |
public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) { | |
try { | |
if (doccatModel == null) { | |
String path = properties.getProperty("opennlp.geoentitylinker.modelbasedscorer.modelpath", ""); | |
if (path.equals("")) { | |
return; | |
} | |
doccatModel = new DoccatModel(new File(path)); | |
documentCategorizerME = new DocumentCategorizerME(doccatModel); | |
} | |
Map<Integer, String> proximalFeatures = generateProximalFeatures(linkedSpans, sentenceSpans, docText, RADIUS); | |
for (Map.Entry<Integer, String> entry : proximalFeatures.entrySet()) { | |
Map<String, Double> scores = this.getScore(entry.getValue()); | |
for (BaseLink link : (List<BaseLink>) linkedSpans.get(entry.getKey()).getLinkedEntries()) { | |
double score = 0d; | |
if (scores.containsKey(link.getItemParentID())) { | |
score = scores.get(link.getItemParentID()); | |
} | |
link.getScoreMap().put("countrymodel", score); | |
} | |
} | |
} catch (FileNotFoundException ex) { | |
System.err.println("could not find modelpath using EntityLinkerProperties. Property should be \"opennlp.geoentitylinker.modelbasedscorer.modelpath\""); | |
} catch (IOException ex) { | |
System.err.println(ex); | |
} catch (Exception ex) { | |
Logger.getLogger(ModelBasedScorer.class.getName()).log(Level.SEVERE, null, ex); | |
} | |
} | |
/** | |
* generates features using a BagOfWordsfeatureGenerator that are within the | |
* radius of a mention within the doctext | |
* | |
* @param linkedSpans | |
* @param docText | |
* @param additionalContext | |
* @param radius | |
* @return a map of the index of the linked span to the string of surrounding | |
* text: Map<indexofspan,surrounding text> | |
*/ | |
public Map<Integer, String> generateProximalFeatures(List<LinkedSpan> linkedSpans, Span[] sentenceSpans, String docText, int radius) { | |
Map<Integer, String> featureBags = new HashMap<>(); | |
Map<Integer, Integer> nameMentionMap = new HashMap<>(); | |
/** | |
* iterator over the map that contains a mapping of every country code to | |
* all of its mentions in the document | |
*/ | |
for (int i = 0; i < linkedSpans.size(); i++) { | |
LinkedSpan span = linkedSpans.get(i); | |
if (span.getLinkedEntries().isEmpty()) { | |
//don't care about spans that did not get linked to anything at all; nothing to work with | |
continue; | |
} | |
/** | |
* get the sentence the name span was found in, the beginning of the | |
* sentence will suffice as a centroid for feature generation around the | |
* named entity | |
*/ | |
Integer mentionIdx = sentenceSpans[span.getSentenceid()].getStart(); | |
nameMentionMap.put(i, mentionIdx); | |
} | |
/** | |
* now associate each span to a string that will be used for categorization | |
* against the model. | |
*/ | |
for (Map.Entry<Integer, Integer> entry : nameMentionMap.entrySet()) { | |
featureBags.put(entry.getKey(), getTextChunk(entry.getValue(), docText, radius)); | |
} | |
return featureBags; | |
} | |
private String getTextChunk(int mentionIdx, String docText, int radius) { | |
int docSize = docText.length(); | |
int left = 0, right = 0; | |
left = (mentionIdx - radius < 0) ? 0 : mentionIdx - radius; | |
right = (mentionIdx + radius > docSize) ? docSize : mentionIdx + radius; | |
String chunk = ""; | |
if (right <= left) { | |
chunk = ""; | |
} else { | |
/** | |
* don't want to chop any words in half, so take fron the first space to | |
* the last space in the chunk string | |
*/ | |
chunk = docText.substring(left, right); | |
if (left != 0) { | |
left = chunk.indexOf(" "); | |
} | |
right = chunk.lastIndexOf(" "); | |
/** | |
* now get the substring again with only whole words | |
*/ | |
if (left < right) { | |
chunk = chunk.substring(left, right); | |
} | |
} | |
return chunk; | |
} | |
private Map<String, Double> getScore(String text) throws Exception { | |
Map<String, Double> scoreMap = new HashMap<>(); | |
if (documentCategorizerME == null) { | |
documentCategorizerME = new DocumentCategorizerME(new DoccatModel(new File(""))); | |
} | |
double[] categorize = documentCategorizerME.categorize(text); | |
int catSize = documentCategorizerME.getNumberOfCategories(); | |
for (int i = 0; i < catSize; i++) { | |
String category = documentCategorizerME.getCategory(i); | |
scoreMap.put(category, categorize[documentCategorizerME.getIndex(category)]); | |
} | |
return scoreMap; | |
} | |
/** | |
* | |
* @param documents A list of document texts, for best results try to | |
* ensure each country you care about will be | |
* represented by the collection | |
* @param annotationOutFile the location where the annotated doccat text file | |
* will be stored | |
* @param modelOutFile the location where the doccat model will be stored | |
* @param properties the properties where the country context object | |
* will find it's country data from this property: | |
* opennlp.geoentitylinker.countrycontext.filepath | |
* @throws IOException | |
*/ | |
public static void buildCountryContextModel(Collection<String> documents, File annotationOutFile, File modelOutFile, EntityLinkerProperties properties) throws IOException { | |
CountryContext context = new CountryContext(); | |
FileWriter writer = new FileWriter(annotationOutFile, true); | |
for (String docText : documents) { | |
Map<String, Set<Integer>> regexfind = context.regexfind(docText, properties); | |
Map<String, ArrayList<String>> modelCountryContext = modelCountryContext(docText, context, RADIUS); | |
for (String key : modelCountryContext.keySet()) { | |
for (String wordbag : modelCountryContext.get(key)) { | |
writer.write(key + " " + wordbag + "\n"); | |
} | |
} | |
} | |
writer.close(); | |
DoccatModel model = null; | |
InputStream dataIn = new FileInputStream(annotationOutFile); | |
try { | |
ObjectStream<String> lineStream = | |
new PlainTextByLineStream(dataIn, "UTF-8"); | |
ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream); | |
model = DocumentCategorizerME.train("en", sampleStream); | |
OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelOutFile)); | |
model.serialize(modelOut); | |
} catch (IOException e) { | |
// Failed to read or parse training data, training failed | |
e.printStackTrace(); | |
} | |
} | |
/** | |
* generates proximal wordbags within the radius of a country mention within | |
* the doctext based on the country context object | |
* | |
* | |
* @param docText | |
* @param additionalContext | |
* @param radius | |
* @return | |
*/ | |
public static Map<String, ArrayList<String>> modelCountryContext(String docText, CountryContext additionalContext, int radius) { | |
Map<String, ArrayList< String>> featureBags = new HashMap<>(); | |
Map<String, Set<Integer>> countryMentions = additionalContext.getCountryMentions(); | |
/** | |
* iterator over the map that contains a mapping of every country code to | |
* all of its mentions in the document | |
*/ | |
for (String code : countryMentions.keySet()) { | |
/** | |
* for each mention, collect features from around each mention, then | |
* consolidate the features into another map | |
*/ | |
for (int mentionIdx : countryMentions.get(code)) { | |
String chunk = scorer.getTextChunk(mentionIdx, docText, radius); | |
// Collection<String> extractFeatures = super.extractFeatures(chunk.split(" ")); | |
if (featureBags.containsKey(code)) { | |
featureBags.get(code).add(chunk); | |
} else { | |
ArrayList<String> newlist = new ArrayList<>(); | |
newlist.add(chunk); | |
featureBags.put(code, newlist); | |
} | |
} | |
} | |
return featureBags; | |
} | |
} |