blob: a60c838fb3bf67c6a6fae7bee0117bcb39d88c1a [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.lang.english;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import opennlp.tools.namefind.NameFinderEventStream;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.parser.Parse;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.util.Span;
/**
* Class is used to create a name finder for English.
*
* @deprecated will be removed soon!
*/
@Deprecated
public class TreebankNameFinder {
public static String[] NAME_TYPES = {"person", "organization", "location", "date", "time", "percentage", "money"};
private NameFinderME nameFinder;
/** Creates an English name finder using the specified model.
* @param mod The model used for finding names.
*/
public TreebankNameFinder(TokenNameFinderModel mod) {
nameFinder = new NameFinderME(mod);
}
private static void clearPrevTokenMaps(TreebankNameFinder[] finders) {
for (int mi = 0; mi < finders.length; mi++) {
finders[mi].nameFinder.clearAdaptiveData();
}
}
private static void processParse(TreebankNameFinder[] finders, String[] tags, BufferedReader input) throws IOException {
Span[][] nameSpans = new Span[finders.length][];
for (String line = input.readLine(); null != line; line = input.readLine()) {
if (line.equals("")) {
System.out.println();
clearPrevTokenMaps(finders);
continue;
}
Parse p = Parse.parseParse(line);
Parse[] tagNodes = p.getTagNodes();
String[] tokens = new String[tagNodes.length];
for (int ti=0;ti<tagNodes.length;ti++){
tokens[ti] = tagNodes[ti].getCoveredText();
}
//System.err.println(java.util.Arrays.asList(tokens));
for (int fi = 0, fl = finders.length; fi < fl; fi++) {
nameSpans[fi] = finders[fi].nameFinder.find(tokens);
//System.err.println("english.NameFinder.processParse: "+tags[fi] + " " + java.util.Arrays.asList(nameSpans[fi]));
}
for (int fi = 0, fl = finders.length; fi < fl; fi++) {
Parse.addNames(tags[fi],nameSpans[fi],tagNodes);
}
p.show();
}
}
/**
* Adds sgml style name tags to the specified input buffer and outputs this information to stdout.
* @param finders The name finders to be used.
* @param tags The tag names for the corresponding name finder.
* @param input The input reader.
* @throws IOException
*/
private static void processText(TreebankNameFinder[] finders, String[] tags, BufferedReader input) throws IOException {
Span[][] nameSpans = new Span[finders.length][];
String[][] nameOutcomes = new String[finders.length][];
opennlp.tools.tokenize.Tokenizer tokenizer = new SimpleTokenizer();
StringBuffer output = new StringBuffer();
for (String line = input.readLine(); null != line; line = input.readLine()) {
if (line.equals("")) {
clearPrevTokenMaps(finders);
System.out.println();
continue;
}
output.setLength(0);
Span[] spans = tokenizer.tokenizePos(line);
String[] tokens = Span.spansToStrings(spans,line);
for (int fi = 0, fl = finders.length; fi < fl; fi++) {
nameSpans[fi] = finders[fi].nameFinder.find(tokens);
//System.err.println("EnglighNameFinder.processText: "+tags[fi] + " " + java.util.Arrays.asList(finderTags[fi]));
nameOutcomes[fi] = NameFinderEventStream.generateOutcomes(nameSpans[fi], null, tokens.length);
}
for (int ti = 0, tl = tokens.length; ti < tl; ti++) {
for (int fi = 0, fl = finders.length; fi < fl; fi++) {
//check for end tags
if (ti != 0) {
if ((nameOutcomes[fi][ti].equals(NameFinderME.START) || nameOutcomes[fi][ti].equals(NameFinderME.OTHER)) &&
(nameOutcomes[fi][ti - 1].equals(NameFinderME.START) || nameOutcomes[fi][ti - 1].equals(NameFinderME.CONTINUE))) {
output.append("</").append(tags[fi]).append(">");
}
}
}
if (ti > 0 && spans[ti - 1].getEnd() < spans[ti].getStart()) {
output.append(line.substring(spans[ti - 1].getEnd(), spans[ti].getStart()));
}
//check for start tags
for (int fi = 0, fl = finders.length; fi < fl; fi++) {
if (nameOutcomes[fi][ti].equals(NameFinderME.START)) {
output.append("<").append(tags[fi]).append(">");
}
}
output.append(tokens[ti]);
}
//final end tags
if (tokens.length != 0) {
for (int fi = 0, fl = finders.length; fi < fl; fi++) {
if (nameOutcomes[fi][tokens.length - 1].equals(NameFinderME.START) || nameOutcomes[fi][tokens.length - 1].equals(NameFinderME.CONTINUE)) {
output.append("</").append(tags[fi]).append(">");
}
}
}
if (tokens.length != 0) {
if (spans[tokens.length - 1].getEnd() < line.length()) {
output.append(line.substring(spans[tokens.length - 1].getEnd()));
}
}
System.out.println(output);
}
}
public static void main(String[] args) throws IOException {
if (args.length == 0) {
System.err.println("Usage NameFinder -[parse] model1 model2 ... modelN < sentences");
System.err.println(" -parse: Use this option to find names on parsed input. Un-tokenized sentence text is the default.");
System.exit(1);
}
int ai = 0;
boolean parsedInput = false;
while (args[ai].startsWith("-") && ai < args.length) {
if (args[ai].equals("-parse")) {
parsedInput = true;
}
else {
System.err.println("Ignoring unknown option "+args[ai]);
}
ai++;
}
TreebankNameFinder[] finders = new TreebankNameFinder[args.length-ai];
String[] names = new String[args.length-ai];
for (int fi=0; ai < args.length; ai++,fi++) {
String modelName = args[ai];
finders[fi] = new TreebankNameFinder(new TokenNameFinderModel(new FileInputStream(modelName)));
int nameStart = modelName.lastIndexOf(System.getProperty("file.separator")) + 1;
int nameEnd = modelName.indexOf('.', nameStart);
if (nameEnd == -1) {
nameEnd = modelName.length();
}
names[fi] = modelName.substring(nameStart, nameEnd);
}
//long t1 = System.currentTimeMillis();
BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
if (parsedInput) {
processParse(finders,names,in);
}
else {
processText(finders,names,in);
}
//long t2 = System.currentTimeMillis();
//System.err.println("Time "+(t2-t1));
}
}