blob: e663f782eb18e137d135977f0215ad3b4802e0d8 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.lemmatizer;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
/**
* Simple feature generator for learning statistical lemmatizers.
* Features based on Grzegorz ChrupaƂa. 2008. Towards a Machine-Learning
* Architecture for Lexical Functional Grammar Parsing. PhD dissertation,
* Dublin City University
* @version 2016-02-15
*/
public class DefaultLemmatizerContextGenerator implements LemmatizerContextGenerator {
private static final int PREFIX_LENGTH = 5;
private static final int SUFFIX_LENGTH = 7;
private static Pattern hasCap = Pattern.compile("[A-Z]");
private static Pattern hasNum = Pattern.compile("[0-9]");
public DefaultLemmatizerContextGenerator() {
}
protected static String[] getPrefixes(String lex) {
String[] prefs = new String[PREFIX_LENGTH];
for (int li = 1; li < PREFIX_LENGTH; li++) {
prefs[li] = lex.substring(0, StrictMath.min(li + 1, lex.length()));
}
return prefs;
}
protected static String[] getSuffixes(String lex) {
String[] suffs = new String[SUFFIX_LENGTH];
for (int li = 1; li < SUFFIX_LENGTH; li++) {
suffs[li] = lex.substring(StrictMath.max(lex.length() - li - 1, 0));
}
return suffs;
}
public String[] getContext(int index, String[] sequence, String[] priorDecisions,
Object[] additionalContext) {
return getContext(index, sequence, (String[]) additionalContext[0], priorDecisions);
}
public String[] getContext(int index, String[] toks, String[] tags, String[] preds) {
// Word
String w0;
// Tag
String t0;
// Previous prediction
String p_1;
String lex = toks[index];
if (index < 1) {
p_1 = "p_1=bos";
}
else {
p_1 = "p_1=" + preds[index - 1];
}
w0 = "w0=" + toks[index];
t0 = "t0=" + tags[index];
List<String> features = new ArrayList<>();
features.add(w0);
features.add(t0);
features.add(p_1);
features.add(p_1 + t0);
features.add(p_1 + w0);
// do some basic suffix analysis
String[] suffs = getSuffixes(lex);
for (int i = 0; i < suffs.length; i++) {
features.add("suf=" + suffs[i]);
}
String[] prefs = getPrefixes(lex);
for (int i = 0; i < prefs.length; i++) {
features.add("pre=" + prefs[i]);
}
// see if the word has any special characters
if (lex.indexOf('-') != -1) {
features.add("h");
}
if (hasCap.matcher(lex).find()) {
features.add("c");
}
if (hasNum.matcher(lex).find()) {
features.add("d");
}
return features.toArray(new String[features.size()]);
}
}