nlpcraft-stanford/src/main/scala/org/apache/nlpcraft/server/nlp/core/stanford/NCStanfordNerEnricher.scala - incubator-nlpcraft - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.nlpcraft.server.nlp.core.stanford

 import edu.stanford.nlp.ling.CoreAnnotations.NormalizedNamedEntityTagAnnotation
 import io.opencensus.trace.Span
 import org.apache.nlpcraft.common.NCService
 import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote}
 import org.apache.nlpcraft.server.ignite.NCIgniteInstance
 import org.apache.nlpcraft.server.nlp.core.NCNlpNerEnricher

 import scala.collection.JavaConverters._

 /**
   * Stanford NLP NER enricher.
   */
 object NCStanfordNerEnricher extends NCService with NCNlpNerEnricher with NCIgniteInstance {
     /**
      *
      * @param parent Optional parent span.
      * @return
      */
     override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { span ⇒
         // Should be started even if another NLP engine configured.
         if (!NCStanfordCoreManager.isStarted)
             NCStanfordCoreManager.start(span)

         ackStart()
     }

     /**
      *
      * @param parent Optional parent span.
      */
     override def stop(parent: Span = null): Unit = startScopedSpan("stop", parent) { span ⇒
         if (NCStanfordCoreManager.isStarted)
             NCStanfordCoreManager.stop(span)

         ackStop()
     }

     /**
      *
      * @param ns
      * @param ebiTokens Set of enabled built-in token IDs.
      */
     override def enrich(ns: NCNlpSentence, ebiTokens: Set[String], parent: Span = null): Unit =
         startScopedSpan("enrich", parent, "srvReqId" → ns.srvReqId, "txt" → ns.text) { _ ⇒
             NCStanfordCoreManager.
                 annotate(ns.text).
                 entityMentions().asScala.
                 filter(e ⇒ ebiTokens.contains(e.entityType().toLowerCase)).
                 foreach(e ⇒ {
                     val offsets = e.charOffsets()

                     val t1 = ns.find(_.startCharIndex == offsets.first)
                     val t2 = ns.find(_.endCharIndex == offsets.second)

                     if (t1.nonEmpty && t2.nonEmpty) {
                         val buf = collection.mutable.ArrayBuffer.empty[(String, Any)]

                         val nne = e.coreMap().get(classOf[NormalizedNamedEntityTagAnnotation])

                         if (nne != null)
                             buf += "nne" → nne

                         val conf = e.entityTypeConfidences()

                         // Key ignored because it can be category with higher level (`location` for type `country`)
                         if (conf.size() == 1)
                             buf += "confidence" → conf.asScala.head._2

                         val typ = e.entityType().toLowerCase

                         val i1 = t1.get.startCharIndex
                         val i2 = t2.get.endCharIndex
                         val toks = ns.filter(t ⇒ t.startCharIndex >= i1 && t.endCharIndex <= i2)

                         val note = NCNlpSentenceNote(toks.map(_.index), s"stanford:$typ", buf: _*)

                         toks.foreach(_.add(note))
                     }
                 })
         }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.nlpcraft.server.nlp.core.stanford

	import edu.stanford.nlp.ling.CoreAnnotations.NormalizedNamedEntityTagAnnotation
	import io.opencensus.trace.Span
	import org.apache.nlpcraft.common.NCService
	import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote}
	import org.apache.nlpcraft.server.ignite.NCIgniteInstance
	import org.apache.nlpcraft.server.nlp.core.NCNlpNerEnricher

	import scala.collection.JavaConverters._

	/**
	* Stanford NLP NER enricher.
	*/
	object NCStanfordNerEnricher extends NCService with NCNlpNerEnricher with NCIgniteInstance {
	/**
	*
	* @param parent Optional parent span.
	* @return
	*/
	override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { span ⇒
	// Should be started even if another NLP engine configured.
	if (!NCStanfordCoreManager.isStarted)
	NCStanfordCoreManager.start(span)

	ackStart()
	}

	/**
	*
	* @param parent Optional parent span.
	*/
	override def stop(parent: Span = null): Unit = startScopedSpan("stop", parent) { span ⇒
	if (NCStanfordCoreManager.isStarted)
	NCStanfordCoreManager.stop(span)

	ackStop()
	}

	/**
	*
	* @param ns
	* @param ebiTokens Set of enabled built-in token IDs.
	*/
	override def enrich(ns: NCNlpSentence, ebiTokens: Set[String], parent: Span = null): Unit =
	startScopedSpan("enrich", parent, "srvReqId" → ns.srvReqId, "txt" → ns.text) { _ ⇒
	NCStanfordCoreManager.
	annotate(ns.text).
	entityMentions().asScala.
	filter(e ⇒ ebiTokens.contains(e.entityType().toLowerCase)).
	foreach(e ⇒ {
	val offsets = e.charOffsets()

	val t1 = ns.find(_.startCharIndex == offsets.first)
	val t2 = ns.find(_.endCharIndex == offsets.second)

	if (t1.nonEmpty && t2.nonEmpty) {
	val buf = collection.mutable.ArrayBuffer.empty[(String, Any)]

	val nne = e.coreMap().get(classOf[NormalizedNamedEntityTagAnnotation])

	if (nne != null)
	buf += "nne" → nne

	val conf = e.entityTypeConfidences()

	// Key ignored because it can be category with higher level (`location` for type `country`)
	if (conf.size() == 1)
	buf += "confidence" → conf.asScala.head._2

	val typ = e.entityType().toLowerCase

	val i1 = t1.get.startCharIndex
	val i2 = t2.get.endCharIndex
	val toks = ns.filter(t ⇒ t.startCharIndex >= i1 && t.endCharIndex <= i2)

	val note = NCNlpSentenceNote(toks.map(_.index), s"stanford:$typ", buf: _*)

	toks.foreach(_.add(note))
	}
	})
	}
	}