jwnl-addon/src/main/java/opennlp/jwnl/lemmatizer/JWNLLemmatizer.java - opennlp-addons - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package opennlp.jwnl.lemmatizer;

 import java.io.IOException;
 import java.util.HashMap;
 import java.util.Map;

 import opennlp.tools.lemmatizer.DictionaryLemmatizer;

 import net.didion.jwnl.JWNLException;
 import net.didion.jwnl.data.Adjective;
 import net.didion.jwnl.data.FileDictionaryElementFactory;
 import net.didion.jwnl.data.IndexWord;
 import net.didion.jwnl.data.POS;
 import net.didion.jwnl.data.PointerType;
 import net.didion.jwnl.data.VerbFrame;
 import net.didion.jwnl.dictionary.FileBackedDictionary;
 import net.didion.jwnl.dictionary.MorphologicalProcessor;
 import net.didion.jwnl.dictionary.file_manager.FileManager;
 import net.didion.jwnl.dictionary.file_manager.FileManagerImpl;
 import net.didion.jwnl.dictionary.morph.DefaultMorphologicalProcessor;
 import net.didion.jwnl.dictionary.morph.DetachSuffixesOperation;
 import net.didion.jwnl.dictionary.morph.LookupExceptionsOperation;
 import net.didion.jwnl.dictionary.morph.LookupIndexWordOperation;
 import net.didion.jwnl.dictionary.morph.Operation;
 import net.didion.jwnl.dictionary.morph.TokenizerOperation;
 import net.didion.jwnl.princeton.data.PrincetonWN17FileDictionaryElementFactory;
 import net.didion.jwnl.princeton.file.PrincetonRandomAccessDictionaryFile;

 /**
  * Dictionary lemmatizer that looks words up in WordNet through the JWNL
  * morphological processor.
  */
 public class JWNLLemmatizer implements DictionaryLemmatizer {

   /** Tag prefix for which an unknown word keeps its original casing. */
   private static final String PROPER_NOUN_TAG_PREFIX = "NNP";

   private final net.didion.jwnl.dictionary.Dictionary dict;
   private final MorphologicalProcessor morphy;

   /**
    * Builds the JWNL morphological processor, installs a file-backed
    * dictionary that reads from the given directory, and keeps the processor
    * exposed by the installed dictionary for later lookups.
    *
    * Constructor code based on Apache OpenNLP JWNLDictionary class.
    *
    * @param wnDirectory
    *          path handed to the JWNL file manager; presumably the WordNet
    *          "dict" directory (confirm against the JWNL documentation)
    * @throws IOException
    *           declared for the JWNL file manager set-up
    * @throws JWNLException
    *           declared for the JWNL dictionary installation
    */
   public JWNLLemmatizer(String wnDirectory) throws IOException, JWNLException {
     PointerType.initialize();
     Adjective.initialize();
     VerbFrame.initialize();

     // Suffix rewrite rules per part of speech: { suffix to detach, replacement }.
     Map<POS, String[][]> suffixMap = new HashMap<POS, String[][]>();
     suffixMap.put(POS.NOUN, new String[][] { { "s", "" }, { "ses", "s" },
         { "xes", "x" }, { "zes", "z" }, { "ches", "ch" }, { "shes", "sh" },
         { "men", "man" }, { "ies", "y" } });
     suffixMap.put(POS.VERB, new String[][] { { "s", "" }, { "ies", "y" },
         { "es", "e" }, { "es", "" }, { "ed", "e" }, { "ed", "" },
         { "ing", "e" }, { "ing", "" } });
     suffixMap.put(POS.ADJECTIVE, new String[][] { { "er", "" }, { "est", "" },
         { "er", "e" }, { "est", "e" } });

     // Suffix detaching used for the individual tokens of a multi-token entry.
     DetachSuffixesOperation tokDso = new DetachSuffixesOperation(suffixMap);
     tokDso.addDelegate(DetachSuffixesOperation.OPERATIONS, new Operation[] {
         new LookupIndexWordOperation(), new LookupExceptionsOperation() });

     // Tokenizer splitting on space and hyphen, delegating each token lookup.
     TokenizerOperation tokOp = new TokenizerOperation(new String[] { " ", "-" });
     tokOp.addDelegate(TokenizerOperation.TOKEN_OPERATIONS,
         new Operation[] { new LookupIndexWordOperation(),
             new LookupExceptionsOperation(), tokDso });

     // Suffix detaching applied to the word as a whole.
     DetachSuffixesOperation morphDso = new DetachSuffixesOperation(suffixMap);
     morphDso.addDelegate(DetachSuffixesOperation.OPERATIONS, new Operation[] {
         new LookupIndexWordOperation(), new LookupExceptionsOperation() });

     Operation[] operations = { new LookupExceptionsOperation(), morphDso, tokOp };
     MorphologicalProcessor processor = new DefaultMorphologicalProcessor(operations);

     FileManager manager = new FileManagerImpl(wnDirectory,
         PrincetonRandomAccessDictionaryFile.class);
     FileDictionaryElementFactory factory = new PrincetonWN17FileDictionaryElementFactory();
     FileBackedDictionary.install(manager, processor, factory, true);

     // Lookups go through the processor exposed by the installed dictionary.
     dict = net.didion.jwnl.dictionary.Dictionary.getInstance();
     morphy = dict.getMorphologicalProcessor();
   }


   /**
    * It takes a word and a POS tag and obtains a word's lemma from WordNet.
    *
    * When the dictionary has no base form for the word, the word itself is
    * returned: unchanged if the tag starts with "NNP", lower-cased otherwise.
    * Lower-casing uses the root locale so that the result does not depend on
    * the default locale of the JVM.
    *
    * @param word
    *          the surface form to lemmatize
    * @param postag
    *          the POS tag of the word; only its prefix is inspected
    * @return the lemma, or {@code null} if the dictionary lookup failed with a
    *         {@link JWNLException}
    */
   public String lemmatize(String word, String postag) {
     try {
       IndexWord baseForm = morphy.lookupBaseForm(toWordNetPos(postag), word);
       if (baseForm != null) {
         return baseForm.getLemma();
       }
     } catch (JWNLException e) {
       // Kept as in the original contract: report the failure and return null.
       e.printStackTrace();
       return null;
     }
     if (postag.startsWith(PROPER_NOUN_TAG_PREFIX)) {
       return word;
     }
     return word.toLowerCase(java.util.Locale.ROOT);
   }

   /**
    * Maps a POS tag to the WordNet part of speech used for the lookup, based
    * on the tag prefix. Tags that match no prefix are looked up as adverbs.
    */
   private static POS toWordNetPos(String postag) {
     if (postag.startsWith("N") || postag.startsWith("n")) {
       return POS.NOUN;
     }
     if (postag.startsWith("V") || postag.startsWith("v")) {
       return POS.VERB;
     }
     if (postag.startsWith("J") || postag.startsWith("a")) {
       return POS.ADJECTIVE;
     }
     // "RB"/"r" tags and every unrecognised tag share the adverb lookup.
     return POS.ADVERB;
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package opennlp.jwnl.lemmatizer;

	import java.io.IOException;
	import java.util.HashMap;
	import java.util.Map;

	import opennlp.tools.lemmatizer.DictionaryLemmatizer;

	import net.didion.jwnl.JWNLException;
	import net.didion.jwnl.data.Adjective;
	import net.didion.jwnl.data.FileDictionaryElementFactory;
	import net.didion.jwnl.data.IndexWord;
	import net.didion.jwnl.data.POS;
	import net.didion.jwnl.data.PointerType;
	import net.didion.jwnl.data.VerbFrame;
	import net.didion.jwnl.dictionary.FileBackedDictionary;
	import net.didion.jwnl.dictionary.MorphologicalProcessor;
	import net.didion.jwnl.dictionary.file_manager.FileManager;
	import net.didion.jwnl.dictionary.file_manager.FileManagerImpl;
	import net.didion.jwnl.dictionary.morph.DefaultMorphologicalProcessor;
	import net.didion.jwnl.dictionary.morph.DetachSuffixesOperation;
	import net.didion.jwnl.dictionary.morph.LookupExceptionsOperation;
	import net.didion.jwnl.dictionary.morph.LookupIndexWordOperation;
	import net.didion.jwnl.dictionary.morph.Operation;
	import net.didion.jwnl.dictionary.morph.TokenizerOperation;
	import net.didion.jwnl.princeton.data.PrincetonWN17FileDictionaryElementFactory;
	import net.didion.jwnl.princeton.file.PrincetonRandomAccessDictionaryFile;

	public class JWNLLemmatizer implements DictionaryLemmatizer {

	private net.didion.jwnl.dictionary.Dictionary dict;
	private MorphologicalProcessor morphy;

	/**
	* Creates JWNL dictionary and morphological processor objects in
	* JWNLemmatizer constructor. It also loads the JWNL configuration into the
	* constructor.
	*
	* Constructor code based on Apache OpenNLP JWNLDictionary class.
	*
	* @param wnDirectory
	* @throws IOException
	* @throws JWNLException
	*/
	public JWNLLemmatizer(String wnDirectory) throws IOException, JWNLException {
	PointerType.initialize();
	Adjective.initialize();
	VerbFrame.initialize();
	Map<POS, String[][]> suffixMap = new HashMap<POS, String[][]>();
	suffixMap.put(POS.NOUN, new String[][] { { "s", "" }, { "ses", "s" },
	{ "xes", "x" }, { "zes", "z" }, { "ches", "ch" }, { "shes", "sh" },
	{ "men", "man" }, { "ies", "y" } });
	suffixMap.put(POS.VERB, new String[][] { { "s", "" }, { "ies", "y" },
	{ "es", "e" }, { "es", "" }, { "ed", "e" }, { "ed", "" },
	{ "ing", "e" }, { "ing", "" } });
	suffixMap.put(POS.ADJECTIVE, new String[][] { { "er", "" }, { "est", "" },
	{ "er", "e" }, { "est", "e" } });
	DetachSuffixesOperation tokDso = new DetachSuffixesOperation(suffixMap);
	tokDso.addDelegate(DetachSuffixesOperation.OPERATIONS, new Operation[] {
	new LookupIndexWordOperation(), new LookupExceptionsOperation() });
	TokenizerOperation tokOp = new TokenizerOperation(new String[] { " ", "-" });
	tokOp.addDelegate(TokenizerOperation.TOKEN_OPERATIONS,
	new Operation[] { new LookupIndexWordOperation(),
	new LookupExceptionsOperation(), tokDso });
	DetachSuffixesOperation morphDso = new DetachSuffixesOperation(suffixMap);
	morphDso.addDelegate(DetachSuffixesOperation.OPERATIONS, new Operation[] {
	new LookupIndexWordOperation(), new LookupExceptionsOperation() });
	Operation[] operations = { new LookupExceptionsOperation(), morphDso, tokOp };
	morphy = new DefaultMorphologicalProcessor(operations);
	FileManager manager = new FileManagerImpl(wnDirectory,
	PrincetonRandomAccessDictionaryFile.class);
	FileDictionaryElementFactory factory = new PrincetonWN17FileDictionaryElementFactory();
	FileBackedDictionary.install(manager, morphy, factory, true);
	dict = net.didion.jwnl.dictionary.Dictionary.getInstance();
	morphy = dict.getMorphologicalProcessor();
	}


	/**
	* It takes a word and a POS tag and obtains a word's lemma from WordNet.
	*
	* @param word
	* @param postag
	* @return lemma
	*/
	public String lemmatize(String word, String postag) {
	String constantTag = "NNP";
	IndexWord baseForm;
	String lemma = null;
	try {
	POS pos;
	if (postag.startsWith("N") \|\| postag.startsWith("n")) {
	pos = POS.NOUN;
	} else if (postag.startsWith("V") \|\| postag.startsWith("v")) {
	pos = POS.VERB;
	} else if (postag.startsWith("J") \|\| postag.startsWith("a")) {
	pos = POS.ADJECTIVE;
	} else if (postag.startsWith("RB") \|\| postag.startsWith("r")) {
	pos = POS.ADVERB;
	} else {
	pos = POS.ADVERB;
	}
	baseForm = morphy.lookupBaseForm(pos, word);
	if (baseForm != null) {
	lemma = baseForm.getLemma().toString();
	}
	else if (baseForm == null && postag.startsWith(String.valueOf(constantTag))) {
	lemma = word;
	}
	else {
	lemma= word.toLowerCase();
	}
	} catch (JWNLException e) {
	e.printStackTrace();
	return null;
	}
	return lemma;
	}

	}