opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java - opennlp - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package opennlp.tools.lemmatizer;

 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;

 /**
  * Lemmatize by simple dictionary lookup into a hashmap built from a file
  * containing, for each line, word\tablemma\tabpostag.
  * @version 2014-07-08
  */
 public class DictionaryLemmatizer implements Lemmatizer {

   /**
    * The hashmap containing the dictionary.
    */
   private final HashMap<List<String>, String> dictMap;

   /**
    * Construct a hashmap from the input tab separated dictionary.
    *
    * The input file should have, for each line, word\tablemma\tabpostag
    *
    * @param dictionary
    *          the input dictionary via inputstream
    */
   public DictionaryLemmatizer(final InputStream dictionary) {
     this.dictMap = new HashMap<List<String>, String>();
     final BufferedReader breader = new BufferedReader(new InputStreamReader(
         dictionary));
     String line;
     try {
       while ((line = breader.readLine()) != null) {
         final String[] elems = line.split("\t");
         this.dictMap.put(Arrays.asList(elems[0], elems[2]), elems[1]);
       }
     } catch (final IOException e) {
       e.printStackTrace();
     }
   }

   /**
    * Get the Map containing the dictionary.
    *
    * @return dictMap the Map
    */
   public HashMap<List<String>, String> getDictMap() {
     return this.dictMap;
   }

   /**
    * Get the dictionary keys (word and postag).
    *
    * @param word
    *          the surface form word
    * @param postag
    *          the assigned postag
    * @return returns the dictionary keys
    */
   private List<String> getDictKeys(final String word, final String postag) {
     final List<String> keys = new ArrayList<String>();
     keys.addAll(Arrays.asList(word.toLowerCase(), postag));
     return keys;
   }

   public String[] lemmatize(final String[] tokens, final String[] postags) {
     List<String> lemmas = new ArrayList<String>();
     for (int i = 0; i < tokens.length; i++) {
       lemmas.add(this.apply(tokens[i], postags[i]));
     }
     return lemmas.toArray(new String[lemmas.size()]);
   }

   /**
    * Lookup lemma in a dictionary. Outputs "O" if not found.
    * @param word the token
    * @param postag the postag
    * @return the lemma
    */
   public String apply(final String word, final String postag) {
     String lemma = null;
     final List<String> keys = this.getDictKeys(word, postag);
     // lookup lemma as value of the map
     final String keyValue = this.dictMap.get(keys);
     if (keyValue != null) {
       lemma = keyValue;
     } else {
       lemma = "O";
     }
     return lemma;
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package opennlp.tools.lemmatizer;

	import java.io.BufferedReader;
	import java.io.IOException;
	import java.io.InputStream;
	import java.io.InputStreamReader;
	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.HashMap;
	import java.util.List;

	/**
	* Lemmatize by simple dictionary lookup into a hashmap built from a file
	* containing, for each line, word\tablemma\tabpostag.
	* @version 2014-07-08
	*/
	public class DictionaryLemmatizer implements Lemmatizer {

	/**
	* The hashmap containing the dictionary.
	*/
	private final HashMap<List<String>, String> dictMap;

	/**
	* Construct a hashmap from the input tab separated dictionary.
	*
	* The input file should have, for each line, word\tablemma\tabpostag
	*
	* @param dictionary
	* the input dictionary via inputstream
	*/
	public DictionaryLemmatizer(final InputStream dictionary) {
	this.dictMap = new HashMap<List<String>, String>();
	final BufferedReader breader = new BufferedReader(new InputStreamReader(
	dictionary));
	String line;
	try {
	while ((line = breader.readLine()) != null) {
	final String[] elems = line.split("\t");
	this.dictMap.put(Arrays.asList(elems[0], elems[2]), elems[1]);
	}
	} catch (final IOException e) {
	e.printStackTrace();
	}
	}

	/**
	* Get the Map containing the dictionary.
	*
	* @return dictMap the Map
	*/
	public HashMap<List<String>, String> getDictMap() {
	return this.dictMap;
	}

	/**
	* Get the dictionary keys (word and postag).
	*
	* @param word
	* the surface form word
	* @param postag
	* the assigned postag
	* @return returns the dictionary keys
	*/
	private List<String> getDictKeys(final String word, final String postag) {
	final List<String> keys = new ArrayList<String>();
	keys.addAll(Arrays.asList(word.toLowerCase(), postag));
	return keys;
	}

	public String[] lemmatize(final String[] tokens, final String[] postags) {
	List<String> lemmas = new ArrayList<String>();
	for (int i = 0; i < tokens.length; i++) {
	lemmas.add(this.apply(tokens[i], postags[i]));
	}
	return lemmas.toArray(new String[lemmas.size()]);
	}

	/**
	* Lookup lemma in a dictionary. Outputs "O" if not found.
	* @param word the token
	* @param postag the postag
	* @return the lemma
	*/
	public String apply(final String word, final String postag) {
	String lemma = null;
	final List<String> keys = this.getDictKeys(word, postag);
	// lookup lemma as value of the map
	final String keyValue = this.dictMap.get(keys);
	if (keyValue != null) {
	lemma = keyValue;
	} else {
	lemma = "O";
	}
	return lemma;
	}
	}