nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/wordnet/NCWordNetManager.scala - incubator-nlpcraft - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.nlpcraft.server.nlp.wordnet

 import io.opencensus.trace.Span
 import net.sf.extjwnl.data.POS._
 import net.sf.extjwnl.data.{IndexWord, POS, PointerType}
 import net.sf.extjwnl.dictionary.{Dictionary, MorphologicalProcessor}
 import org.apache.nlpcraft.common._
 import org.apache.nlpcraft.common.NCService

 import scala.collection.JavaConverters._

 /**
   * WordNet manager.
   *
   * Service wrapper around the extJWNL `Dictionary` and its `MorphologicalProcessor`.
   * Both are obtained in `start()`; the lookup methods below read them and therefore
   * must only be called after the service has been started.
   */
 object NCWordNetManager extends NCService {
     // Assigned in `start()`.
     @volatile private var dic: Dictionary = _
     @volatile private var morph: MorphologicalProcessor = _

     /**
       * Maps a Penn Treebank POS tag to a WordNet POS using the tag's first character
       * ('N' - noun, 'V' - verb, 'J' - adjective, 'R' - adverb).
       *
       * @param pennPos Penn Treebank POS tag.
       * @return WordNet POS or `None` if the tag is `null`, empty or not supported.
       */
     private def pennPos2WordNet(pennPos: String): Option[POS] =
         // 'headOption' (rather than 'head') keeps an empty tag from throwing -
         // such a tag is simply treated as an unsupported POS.
         Option(pennPos).flatMap(_.headOption).flatMap {
             case 'N' ⇒ Some(NOUN)
             case 'V' ⇒ Some(VERB)
             case 'J' ⇒ Some(ADJECTIVE)
             case 'R' ⇒ Some(ADVERB)

             case _ ⇒ None
         }

     // Process WordNet formatted multi-word entries (they are split with '_').
     // Plain character replacement - no regular expression is needed here.
     private def normalize(str: String) = str.replace('_', ' ')

     /**
       * Collects distinct lemmas of the given target POS which are reachable from the
       * given word through `DERIVATION` pointers of its senses.
       *
       * @param str Word to convert.
       * @param initPos POS of the given word.
       * @param targetPos POS of the words to collect.
       * @return Distinct normalized lemmas or empty sequence if the word is not found.
       */
     private def convert(str: String, initPos: POS, targetPos: POS): Seq[String] =
         // Dictionary returns 'null' for unknown words.
         Option(dic.getIndexWord(initPos, str)) match {
             case Some(word) ⇒
                 word.getSenses.asScala.flatMap(synset ⇒
                     synset.getPointers(PointerType.DERIVATION).asScala.flatMap(ptr ⇒ {
                         val trg = ptr.getTargetSynset

                         if (trg.getPOS == targetPos)
                             trg.getWords.asScala.map(w ⇒ normalize(w.getLemma))
                         else
                             Seq.empty
                     })
                 ).distinct

             case None ⇒ Seq.empty[String]
         }

     /**
       * Starts this service: obtains the default resource dictionary instance and
       * its morphological processor.
       *
       * @param parent Optional parent span.
       * @throws NCE
       * @return This service.
       */
     @throws[NCE]
     override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ ⇒
         ackStarting()

         dic = Dictionary.getDefaultResourceInstance
         morph = dic.getMorphologicalProcessor

         ackStarted()
     }

     /**
       * Stops this service.
       *
       * @param parent Optional parent span.
       */
     override def stop(parent: Span): Unit = startScopedSpan("stop", parent) { _ ⇒
         ackStopping()
         ackStopped()
     }

     /**
       * Gets a sequence of possible noun relatives for the given adjective.
       *
       * @param adj An adjective to match.
       * @return A number of possible noun relatives.
       */
     def getNNsForJJ(adj: String): Seq[String] = convert(adj, ADJECTIVE, NOUN)

     /**
       * Gets a sequence of possible adjective relatives for the given noun.
       *
       * @param noun A noun to match.
       * @return A number of possible adjective relatives.
       */
     def getJJsForNN(noun: String): Seq[String] = convert(noun, NOUN, ADJECTIVE)

     /**
       * Gets the base form using a more precise method.
       *
       * It drops base forms like 'Alice'→'louse', 'God'→'od' and 'better'→'well'
       * which are produced by WordNet if the exact base form is not found: the WordNet
       * base form is accepted only if it is one of the lemma's synonyms, otherwise
       * the input lemma is returned unchanged.
       *
       * @param lemma Lemma to get a WordNet base form.
       * @param pennPos Lemma's Penn Treebank POS tag.
       * @param syns Optional set of lemma's synonyms. If `null`, the synonyms are
       *      looked up with `getSynonyms` and flattened.
       * @return WordNet base form or the input lemma.
       */
     def getBaseForm(lemma: String, pennPos: String, syns: Set[String] = null): String =
         pennPos2WordNet(pennPos) match {
             case Some(wnPos) ⇒
                 // Morphological processor returns 'null' if no base form is found.
                 val baseWord: Option[IndexWord] = Option(morph.lookupBaseForm(wnPos, lemma))

                 baseWord match {
                     case Some(wnWord) ⇒
                         val wnLemma = wnWord.getLemma
                         val synonyms = if (syns == null) getSynonyms(lemma, pennPos).flatten.toSet else syns

                         if (synonyms.contains(wnLemma)) wnLemma else lemma

                     case None ⇒ lemma
                 }

             // For unsupported POS tags - return the input lemma.
             case None ⇒ lemma
         }

     /**
       * Gets synonyms for given lemma and its POS tag.
       *
       * @param lemma Lemma to find synonyms for.
       * @param pennPos Lemma's Penn Treebank POS tag.
       * @return Non-empty groups of synonyms, one group per synset. Synonyms are
       *      lowercased and normalized, the values equal to the given lemma are dropped.
       */
     def getSynonyms(lemma: String, pennPos: String): Seq[Seq[String]] = {
         val res: Seq[Seq[String]] = pennPos2WordNet(pennPos) match {
             case Some(wnPos) ⇒
                 // Both the index word and its offsets array may be 'null'.
                 val offsOpt: Option[Array[Long]] =
                     Option(dic.lookupIndexWord(wnPos, lemma)).flatMap(w ⇒ Option(w.getSynsetOffsets))

                 offsOpt match {
                     case Some(synsOffs) ⇒
                         synsOffs.
                             map(dic.getSynsetAt(wnPos, _)).
                             filter(_.getPOS == wnPos).
                             map(
                                 _.getWords.asScala.
                                     map(_.getLemma.toLowerCase).
                                     filter(_ != lemma).
                                     map(normalize).toSeq
                             )

                     case None ⇒ Seq.empty
                 }

             // Invalid POS.
             case None ⇒ Seq.empty
         }

         res.filter(_.nonEmpty)
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.nlpcraft.server.nlp.wordnet

	import io.opencensus.trace.Span
	import net.sf.extjwnl.data.POS._
	import net.sf.extjwnl.data.{IndexWord, POS, PointerType}
	import net.sf.extjwnl.dictionary.{Dictionary, MorphologicalProcessor}
	import org.apache.nlpcraft.common._
	import org.apache.nlpcraft.common.NCService

	import scala.collection.JavaConverters._

	/**
	  * WordNet manager.
	  *
	  * Service wrapper around the extJWNL `Dictionary` and its `MorphologicalProcessor`.
	  * Both are obtained in `start()`; the lookup methods below read them and therefore
	  * must only be called after the service has been started.
	  */
	object NCWordNetManager extends NCService {
	    // Assigned in `start()`.
	    @volatile private var dic: Dictionary = _
	    @volatile private var morph: MorphologicalProcessor = _

	    /**
	      * Maps a Penn Treebank POS tag to a WordNet POS using the tag's first character
	      * ('N' - noun, 'V' - verb, 'J' - adjective, 'R' - adverb).
	      *
	      * @param pennPos Penn Treebank POS tag.
	      * @return WordNet POS or `None` if the tag is `null`, empty or not supported.
	      */
	    private def pennPos2WordNet(pennPos: String): Option[POS] =
	        // 'headOption' (rather than 'head') keeps an empty tag from throwing -
	        // such a tag is simply treated as an unsupported POS.
	        Option(pennPos).flatMap(_.headOption).flatMap {
	            case 'N' ⇒ Some(NOUN)
	            case 'V' ⇒ Some(VERB)
	            case 'J' ⇒ Some(ADJECTIVE)
	            case 'R' ⇒ Some(ADVERB)

	            case _ ⇒ None
	        }

	    // Process WordNet formatted multi-word entries (they are split with '_').
	    // Plain character replacement - no regular expression is needed here.
	    private def normalize(str: String) = str.replace('_', ' ')

	    /**
	      * Collects distinct lemmas of the given target POS which are reachable from the
	      * given word through `DERIVATION` pointers of its senses.
	      *
	      * @param str Word to convert.
	      * @param initPos POS of the given word.
	      * @param targetPos POS of the words to collect.
	      * @return Distinct normalized lemmas or empty sequence if the word is not found.
	      */
	    private def convert(str: String, initPos: POS, targetPos: POS): Seq[String] =
	        // Dictionary returns 'null' for unknown words.
	        Option(dic.getIndexWord(initPos, str)) match {
	            case Some(word) ⇒
	                word.getSenses.asScala.flatMap(synset ⇒
	                    synset.getPointers(PointerType.DERIVATION).asScala.flatMap(ptr ⇒ {
	                        val trg = ptr.getTargetSynset

	                        if (trg.getPOS == targetPos)
	                            trg.getWords.asScala.map(w ⇒ normalize(w.getLemma))
	                        else
	                            Seq.empty
	                    })
	                ).distinct

	            case None ⇒ Seq.empty[String]
	        }

	    /**
	      * Starts this service: obtains the default resource dictionary instance and
	      * its morphological processor.
	      *
	      * @param parent Optional parent span.
	      * @throws NCE
	      * @return This service.
	      */
	    @throws[NCE]
	    override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ ⇒
	        ackStarting()

	        dic = Dictionary.getDefaultResourceInstance
	        morph = dic.getMorphologicalProcessor

	        ackStarted()
	    }

	    /**
	      * Stops this service.
	      *
	      * @param parent Optional parent span.
	      */
	    override def stop(parent: Span): Unit = startScopedSpan("stop", parent) { _ ⇒
	        ackStopping()
	        ackStopped()
	    }

	    /**
	      * Gets a sequence of possible noun relatives for the given adjective.
	      *
	      * @param adj An adjective to match.
	      * @return A number of possible noun relatives.
	      */
	    def getNNsForJJ(adj: String): Seq[String] = convert(adj, ADJECTIVE, NOUN)

	    /**
	      * Gets a sequence of possible adjective relatives for the given noun.
	      *
	      * @param noun A noun to match.
	      * @return A number of possible adjective relatives.
	      */
	    def getJJsForNN(noun: String): Seq[String] = convert(noun, NOUN, ADJECTIVE)

	    /**
	      * Gets the base form using a more precise method.
	      *
	      * It drops base forms like 'Alice'→'louse', 'God'→'od' and 'better'→'well'
	      * which are produced by WordNet if the exact base form is not found: the WordNet
	      * base form is accepted only if it is one of the lemma's synonyms, otherwise
	      * the input lemma is returned unchanged.
	      *
	      * @param lemma Lemma to get a WordNet base form.
	      * @param pennPos Lemma's Penn Treebank POS tag.
	      * @param syns Optional set of lemma's synonyms. If `null`, the synonyms are
	      *      looked up with `getSynonyms` and flattened.
	      * @return WordNet base form or the input lemma.
	      */
	    def getBaseForm(lemma: String, pennPos: String, syns: Set[String] = null): String =
	        pennPos2WordNet(pennPos) match {
	            case Some(wnPos) ⇒
	                // Morphological processor returns 'null' if no base form is found.
	                val baseWord: Option[IndexWord] = Option(morph.lookupBaseForm(wnPos, lemma))

	                baseWord match {
	                    case Some(wnWord) ⇒
	                        val wnLemma = wnWord.getLemma
	                        val synonyms = if (syns == null) getSynonyms(lemma, pennPos).flatten.toSet else syns

	                        if (synonyms.contains(wnLemma)) wnLemma else lemma

	                    case None ⇒ lemma
	                }

	            // For unsupported POS tags - return the input lemma.
	            case None ⇒ lemma
	        }

	    /**
	      * Gets synonyms for given lemma and its POS tag.
	      *
	      * @param lemma Lemma to find synonyms for.
	      * @param pennPos Lemma's Penn Treebank POS tag.
	      * @return Non-empty groups of synonyms, one group per synset. Synonyms are
	      *      lowercased and normalized, the values equal to the given lemma are dropped.
	      */
	    def getSynonyms(lemma: String, pennPos: String): Seq[Seq[String]] = {
	        val res: Seq[Seq[String]] = pennPos2WordNet(pennPos) match {
	            case Some(wnPos) ⇒
	                // Both the index word and its offsets array may be 'null'.
	                val offsOpt: Option[Array[Long]] =
	                    Option(dic.lookupIndexWord(wnPos, lemma)).flatMap(w ⇒ Option(w.getSynsetOffsets))

	                offsOpt match {
	                    case Some(synsOffs) ⇒
	                        synsOffs.
	                            map(dic.getSynsetAt(wnPos, _)).
	                            filter(_.getPOS == wnPos).
	                            map(
	                                _.getWords.asScala.
	                                    map(_.getLemma.toLowerCase).
	                                    filter(_ != lemma).
	                                    map(normalize).toSeq
	                            )

	                    case None ⇒ Seq.empty
	                }

	            // Invalid POS.
	            case None ⇒ Seq.empty
	        }

	        res.filter(_.nonEmpty)
	    }
	}