| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| |
| package opennlp.tools.postag; |
| |
| import java.util.ArrayList; |
| import java.util.List; |
| import java.util.regex.Pattern; |
| |
| import opennlp.tools.dictionary.Dictionary; |
| import opennlp.tools.util.Cache; |
| import opennlp.tools.util.StringList; |
| |
| /** |
| * A context generator for the POS Tagger. |
| */ |
| public class DefaultPOSContextGenerator implements POSContextGenerator { |
| |
| protected final String SE = "*SE*"; |
| protected final String SB = "*SB*"; |
| private static final int PREFIX_LENGTH = 4; |
| private static final int SUFFIX_LENGTH = 4; |
| |
| private static Pattern hasCap = Pattern.compile("[A-Z]"); |
| private static Pattern hasNum = Pattern.compile("[0-9]"); |
| |
| private Cache<String, String[]> contextsCache; |
| private Object wordsKey; |
| |
| private Dictionary dict; |
| |
| /** |
| * Initializes the current instance. |
| * |
| * @param dict |
| */ |
| public DefaultPOSContextGenerator(Dictionary dict) { |
| this(0,dict); |
| } |
| |
| /** |
| * Initializes the current instance. |
| * |
| * @param cacheSize |
| * @param dict |
| */ |
| public DefaultPOSContextGenerator(int cacheSize, Dictionary dict) { |
| this.dict = dict; |
| |
| if (cacheSize > 0) { |
| contextsCache = new Cache<>(cacheSize); |
| } |
| } |
| |
| protected static String[] getPrefixes(String lex) { |
| String[] prefs = new String[PREFIX_LENGTH]; |
| for (int li = 0; li < PREFIX_LENGTH; li++) { |
| prefs[li] = lex.substring(0, StrictMath.min(li + 1, lex.length())); |
| } |
| return prefs; |
| } |
| |
| protected static String[] getSuffixes(String lex) { |
| String[] suffs = new String[SUFFIX_LENGTH]; |
| for (int li = 0; li < SUFFIX_LENGTH; li++) { |
| suffs[li] = lex.substring(StrictMath.max(lex.length() - li - 1, 0)); |
| } |
| return suffs; |
| } |
| |
| public String[] getContext(int index, String[] sequence, String[] priorDecisions, |
| Object[] additionalContext) { |
| return getContext(index,sequence,priorDecisions); |
| } |
| |
| /** |
| * Returns the context for making a pos tag decision at the specified token index |
| * given the specified tokens and previous tags. |
| * @param index The index of the token for which the context is provided. |
| * @param tokens The tokens in the sentence. |
| * @param tags The tags assigned to the previous words in the sentence. |
| * @return The context for making a pos tag decision at the specified token index |
| * given the specified tokens and previous tags. |
| */ |
| public String[] getContext(int index, Object[] tokens, String[] tags) { |
| String next, nextnext = null, lex, prev, prevprev = null; |
| String tagprev, tagprevprev; |
| tagprev = tagprevprev = null; |
| |
| lex = tokens[index].toString(); |
| if (tokens.length > index + 1) { |
| next = tokens[index + 1].toString(); |
| if (tokens.length > index + 2) |
| nextnext = tokens[index + 2].toString(); |
| else |
| nextnext = SE; // Sentence End |
| |
| } |
| else { |
| next = SE; // Sentence End |
| } |
| |
| if (index - 1 >= 0) { |
| prev = tokens[index - 1].toString(); |
| tagprev = tags[index - 1]; |
| |
| if (index - 2 >= 0) { |
| prevprev = tokens[index - 2].toString(); |
| tagprevprev = tags[index - 2]; |
| } |
| else { |
| prevprev = SB; // Sentence Beginning |
| } |
| } |
| else { |
| prev = SB; // Sentence Beginning |
| } |
| String cacheKey = index + tagprev + tagprevprev; |
| if (contextsCache != null) { |
| if (wordsKey == tokens) { |
| String[] cachedContexts = contextsCache.get(cacheKey); |
| if (cachedContexts != null) { |
| return cachedContexts; |
| } |
| } |
| else { |
| contextsCache.clear(); |
| wordsKey = tokens; |
| } |
| } |
| List<String> e = new ArrayList<>(); |
| e.add("default"); |
| // add the word itself |
| e.add("w=" + lex); |
| |
| if (dict == null || !dict.contains(new StringList(lex))) { |
| // do some basic suffix analysis |
| String[] suffs = getSuffixes(lex); |
| for (int i = 0; i < suffs.length; i++) { |
| e.add("suf=" + suffs[i]); |
| } |
| |
| String[] prefs = getPrefixes(lex); |
| for (int i = 0; i < prefs.length; i++) { |
| e.add("pre=" + prefs[i]); |
| } |
| // see if the word has any special characters |
| if (lex.indexOf('-') != -1) { |
| e.add("h"); |
| } |
| |
| if (hasCap.matcher(lex).find()) { |
| e.add("c"); |
| } |
| |
| if (hasNum.matcher(lex).find()) { |
| e.add("d"); |
| } |
| } |
| // add the words and pos's of the surrounding context |
| if (prev != null) { |
| e.add("p=" + prev); |
| if (tagprev != null) { |
| e.add("t=" + tagprev); |
| } |
| if (prevprev != null) { |
| e.add("pp=" + prevprev); |
| if (tagprevprev != null) { |
| e.add("t2=" + tagprevprev + "," + tagprev); |
| } |
| } |
| } |
| |
| if (next != null) { |
| e.add("n=" + next); |
| if (nextnext != null) { |
| e.add("nn=" + nextnext); |
| } |
| } |
| String[] contexts = e.toArray(new String[e.size()]); |
| if (contextsCache != null) { |
| contextsCache.put(cacheKey,contexts); |
| } |
| return contexts; |
| } |
| |
| } |