blob: 815f35e518c155302f1d02cbd02c471292a64810 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.lemmatizer;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
/**
 * Lemmatizes by simple dictionary lookup into a hash map built from a
 * tab-separated file in which every line has the form
 * {@code word<TAB>lemma<TAB>postag}.
 *
 * <p>Dictionary words are stored exactly as written in the file, whereas the
 * word being looked up is lower-cased first. Dictionary entries whose word
 * contains upper-case characters are therefore never matched.
 *
 * @version 2014-07-08
 */
public class DictionaryLemmatizer implements Lemmatizer {

  /**
   * The dictionary: maps a two-element {@code [word, postag]} list to the
   * lemma of that word/postag pair.
   */
  private final HashMap<List<String>, String> dictMap;

  /**
   * Builds the lookup map from a tab-separated dictionary.
   *
   * <p>Every non-blank line must contain at least three tab-separated
   * fields, in the order word, lemma, postag; any further fields are
   * ignored. Blank lines are skipped. If the same word/postag pair occurs
   * more than once, the last occurrence wins. The input is decoded as UTF-8.
   * The stream is read to its end but is not closed here.
   *
   * @param dictionary
   *          the input dictionary via inputstream
   * @throws IllegalArgumentException
   *           if a non-blank line has fewer than three tab-separated fields
   * @throws IllegalStateException
   *           if the dictionary cannot be read; the underlying
   *           {@link IOException} is attached as the cause
   */
  public DictionaryLemmatizer(final InputStream dictionary) {
    this.dictMap = new HashMap<List<String>, String>();
    try {
      // Decode explicitly as UTF-8 so that the dictionary content does not
      // depend on the default charset of the machine loading it.
      final BufferedReader breader = new BufferedReader(
          new InputStreamReader(dictionary, "UTF-8"));
      String line;
      int lineNumber = 0;
      while ((line = breader.readLine()) != null) {
        lineNumber++;
        if (line.trim().length() == 0) {
          // Tolerate empty lines, e.g. a trailing newline at the end of file.
          continue;
        }
        final String[] elems = line.split("\t");
        if (elems.length < 3) {
          // Fail with a message that identifies the offending line instead
          // of an anonymous ArrayIndexOutOfBoundsException.
          throw new IllegalArgumentException("Malformed dictionary entry at line "
              + lineNumber + " (expected word<TAB>lemma<TAB>postag): " + line);
        }
        // The key is (word, postag); the value is the lemma.
        this.dictMap.put(Arrays.asList(elems[0], elems[2]), elems[1]);
      }
    } catch (final IOException e) {
      // Do not continue with a silently empty or half-loaded dictionary.
      throw new IllegalStateException(
          "Unable to read the lemmatizer dictionary", e);
    }
  }

  /**
   * Get the Map containing the dictionary.
   *
   * <p>The returned object is the internal map itself, not a copy, so
   * modifications made through it affect subsequent lookups.
   *
   * @return dictMap the Map
   */
  public HashMap<List<String>, String> getDictMap() {
    return this.dictMap;
  }

  /**
   * Get the dictionary keys (word and postag).
   *
   * @param word
   *          the surface form word; it is lower-cased before the lookup
   * @param postag
   *          the assigned postag; used as is
   * @return the two-element key {@code [lower-cased word, postag]}
   */
  private List<String> getDictKeys(final String word, final String postag) {
    // List equality and hash codes depend only on the elements, so this
    // fixed-size list matches the keys stored by the constructor.
    return Arrays.asList(word.toLowerCase(), postag);
  }

  /**
   * Lemmatizes a sequence of tokens, one dictionary lookup per token.
   *
   * @param tokens
   *          the tokens to lemmatize
   * @param postags
   *          the postags of the tokens; {@code postags[i]} is paired with
   *          {@code tokens[i]}, so at least {@code tokens.length} entries
   *          are required
   * @return an array of the same length as {@code tokens} holding, for each
   *         token, its lemma or "O" if the dictionary has no entry for it
   */
  public String[] lemmatize(final String[] tokens, final String[] postags) {
    final String[] lemmas = new String[tokens.length];
    for (int i = 0; i < tokens.length; i++) {
      lemmas[i] = this.apply(tokens[i], postags[i]);
    }
    return lemmas;
  }

  /**
   * Lookup lemma in a dictionary. Outputs "O" if not found.
   *
   * @param word
   *          the token
   * @param postag
   *          the postag
   * @return the lemma, or "O" if the word/postag pair is not in the
   *         dictionary
   */
  public String apply(final String word, final String postag) {
    // lookup lemma as value of the map
    final String lemma = this.dictMap.get(this.getDictKeys(word, postag));
    return lemma != null ? lemma : "O";
  }
}