blob: 6def798109ac882a265045525544c4be44aa5ef9 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.postag;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.util.Cache;
import opennlp.tools.util.StringList;
/**
 * A context generator for the POS Tagger.
 * <p>
 * For a given token position it produces string features describing the token itself
 * ({@code w=}), its prefixes and suffixes ({@code pre=}, {@code suf=}), simple character
 * classes ({@code h}, {@code c}, {@code d}), the two neighbouring tokens on each side
 * ({@code p=}, {@code pp=}, {@code n=}, {@code nn=}) and the previously assigned tags
 * ({@code t=}, {@code t2=}).
 * <p>
 * When created with a positive cache size, contexts are cached per token array (compared by
 * identity) and keyed by position and tag history. This class adds no synchronization of its
 * own around that cache state.
 */
public class DefaultPOSContextGenerator implements POSContextGenerator {

  /** Placeholder used for a following token that lies past the end of the sentence. */
  protected final String SE = "*SE*";

  /** Placeholder used for a preceding token that lies before the start of the sentence. */
  protected final String SB = "*SB*";

  private static final int PREFIX_LENGTH = 4;
  private static final int SUFFIX_LENGTH = 4;

  private static final Pattern HAS_CAPITAL = Pattern.compile("[A-Z]");
  private static final Pattern HAS_DIGIT = Pattern.compile("[0-9]");

  /** Context cache, or {@code null} when caching is disabled. */
  private final Cache<String, String[]> contextsCache;

  /** The token array the cache currently holds contexts for (compared by identity). */
  private Object wordsKey;

  /** Optional dictionary; words it contains skip the prefix/suffix/character features. */
  private final Dictionary dict;

  /**
   * Initializes the current instance without a context cache.
   *
   * @param dict dictionary of words for which no prefix, suffix or character-class features
   *     are generated; may be {@code null}, in which case those features are always generated
   */
  public DefaultPOSContextGenerator(Dictionary dict) {
    this(0, dict);
  }

  /**
   * Initializes the current instance.
   *
   * @param cacheSize size passed to the context cache; a cache is only created when this
   *     value is greater than zero
   * @param dict dictionary of words for which no prefix, suffix or character-class features
   *     are generated; may be {@code null}, in which case those features are always generated
   */
  public DefaultPOSContextGenerator(int cacheSize, Dictionary dict) {
    this.dict = dict;
    this.contextsCache = cacheSize > 0 ? new Cache<>(cacheSize) : null;
  }

  /**
   * Returns the leading substrings of the given word with lengths 1 to {@value #PREFIX_LENGTH}.
   * Lengths are capped at the word length, so short words yield repeated entries.
   *
   * @param lex the word to take prefixes from
   * @return an array of exactly {@value #PREFIX_LENGTH} prefixes, shortest first
   */
  protected static String[] getPrefixes(String lex) {
    String[] prefs = new String[PREFIX_LENGTH];
    for (int li = 0; li < PREFIX_LENGTH; li++) {
      prefs[li] = lex.substring(0, Math.min(li + 1, lex.length()));
    }
    return prefs;
  }

  /**
   * Returns the trailing substrings of the given word with lengths 1 to {@value #SUFFIX_LENGTH}.
   * Lengths are capped at the word length, so short words yield repeated entries.
   *
   * @param lex the word to take suffixes from
   * @return an array of exactly {@value #SUFFIX_LENGTH} suffixes, shortest first
   */
  protected static String[] getSuffixes(String lex) {
    String[] suffs = new String[SUFFIX_LENGTH];
    for (int li = 0; li < SUFFIX_LENGTH; li++) {
      suffs[li] = lex.substring(Math.max(lex.length() - li - 1, 0));
    }
    return suffs;
  }

  /**
   * Returns the context for the token at the given index. The additional context is ignored;
   * this simply delegates to {@link #getContext(int, Object[], String[])}.
   *
   * @param index The index of the token for which the context is provided.
   * @param sequence The tokens in the sentence.
   * @param priorDecisions The tags assigned to the previous words in the sentence.
   * @param additionalContext Not used by this implementation.
   * @return The context for making a pos tag decision at the specified token index.
   */
  public String[] getContext(int index, String[] sequence, String[] priorDecisions,
      Object[] additionalContext) {
    return getContext(index, sequence, priorDecisions);
  }

  /**
   * Returns the context for making a pos tag decision at the specified token index
   * given the specified tokens and previous tags.
   * <p>
   * When caching is enabled, repeated calls with the same token array instance, index and
   * tag history return the same cached array instance, so callers should not modify the
   * returned array.
   *
   * @param index The index of the token for which the context is provided.
   * @param tokens The tokens in the sentence.
   * @param tags The tags assigned to the previous words in the sentence.
   * @return The context for making a pos tag decision at the specified token index
   *     given the specified tokens and previous tags.
   */
  public String[] getContext(int index, Object[] tokens, String[] tags) {
    String lex = tokens[index].toString();

    // Following tokens; positions past the end of the sentence become SE,
    // and nextnext stays null when next itself is already past the end.
    String next;
    String nextnext = null;
    if (tokens.length > index + 1) {
      next = tokens[index + 1].toString();
      nextnext = tokens.length > index + 2 ? tokens[index + 2].toString() : SE;
    } else {
      next = SE;
    }

    // Preceding tokens and their tags; positions before the sentence start become SB,
    // and prevprev stays null when prev itself is already before the start.
    String prev;
    String prevprev = null;
    String tagprev = null;
    String tagprevprev = null;
    if (index - 1 >= 0) {
      prev = tokens[index - 1].toString();
      tagprev = tags[index - 1];
      if (index - 2 >= 0) {
        prevprev = tokens[index - 2].toString();
        tagprevprev = tags[index - 2];
      } else {
        prevprev = SB;
      }
    } else {
      prev = SB;
    }

    String cacheKey = null;
    if (contextsCache != null) {
      cacheKey = buildCacheKey(index, tagprev, tagprevprev);
      if (wordsKey == tokens) {
        String[] cachedContexts = contextsCache.get(cacheKey);
        if (cachedContexts != null) {
          return cachedContexts;
        }
      } else {
        // A different token array was passed in: cached contexts belong to the old one.
        contextsCache.clear();
        wordsKey = tokens;
      }
    }

    List<String> e = new ArrayList<>();
    e.add("default");
    // add the word itself
    e.add("w=" + lex);
    if (dict == null || !dict.contains(new StringList(lex))) {
      // do some basic suffix analysis
      for (String suffix : getSuffixes(lex)) {
        e.add("suf=" + suffix);
      }
      for (String prefix : getPrefixes(lex)) {
        e.add("pre=" + prefix);
      }
      // see if the word has any special characters
      if (lex.indexOf('-') != -1) {
        e.add("h");
      }
      if (HAS_CAPITAL.matcher(lex).find()) {
        e.add("c");
      }
      if (HAS_DIGIT.matcher(lex).find()) {
        e.add("d");
      }
    }

    // add the words and pos's of the surrounding context
    // (null checks on prev/next are kept: tokens are Objects, so toString() may return null)
    if (prev != null) {
      e.add("p=" + prev);
      if (tagprev != null) {
        e.add("t=" + tagprev);
      }
      if (prevprev != null) {
        e.add("pp=" + prevprev);
        if (tagprevprev != null) {
          e.add("t2=" + tagprevprev + "," + tagprev);
        }
      }
    }
    if (next != null) {
      e.add("n=" + next);
      if (nextnext != null) {
        e.add("nn=" + nextnext);
      }
    }

    String[] contexts = e.toArray(new String[e.size()]);
    if (contextsCache != null) {
      contextsCache.put(cacheKey, contexts);
    }
    return contexts;
  }

  /**
   * Builds an unambiguous cache key from the token index and the two previous tags.
   * <p>
   * Plain concatenation is not safe: for example the tag pairs ("NN", "PRP") and
   * ("NNP", "RP") would both yield "NNPRP" and share a cache entry although their
   * {@code t=} / {@code t2=} features differ. Prefixing the length of the first tag
   * makes the boundary between the two tags recoverable, so distinct histories get
   * distinct keys.
   */
  private static String buildCacheKey(int index, String tagprev, String tagprevprev) {
    int tagprevLength = tagprev == null ? -1 : tagprev.length();
    return index + ":" + tagprevLength + ":" + tagprev + tagprevprev;
  }
}