blob: 6f379b613531907d6510023d52ba5467f17d00f6 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.relation
import java.io.Serializable
import io.opencensus.trace.Span
import org.apache.nlpcraft.common.makro.NCMacroParser
import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager
import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSentenceToken}
import org.apache.nlpcraft.common.{NCE, NCService}
import org.apache.nlpcraft.probe.mgrs.NCProbeModel
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
import scala.collection.JavaConverters._
import scala.collection.{Map, Seq, mutable}
/**
* Relation enricher.
*/
object NCRelationEnricher extends NCProbeEnricher {
    /**
     * Relation function holder.
     *
     * @param funcType Relation function type (`compare` or `correlate`).
     * @param headStem Stem of the mandatory "head" word for this function.
     * @param allStems All stems recognized for this function (head stem plus connecting-word stems).
     */
    case class Holder(funcType: String, headStem: String, allStems: Set[String])

    /**
     * Successful match of a relation function against a token span.
     *
     * @param funcType Matched relation function type.
     * @param matched All tokens that matched the function words.
     * @param matchedHead Token matching the function's head word.
     * @param refNotes Note types of the referenced tokens.
     * @param refIndexes Indexes of the referenced tokens (Java list, stored directly in the note).
     */
    case class Match(
        funcType: String,
        matched: Seq[NCNlpSentenceToken],
        matchedHead: NCNlpSentenceToken,
        refNotes: Set[String],
        refIndexes: java.util.List[Int]
    )

    /**
     * Group of tokens referenced by a relation function, together with their note types.
     * Both collections are required to be non-empty by construction.
     *
     * @param tokens Referenced tokens.
     * @param types Note types found on the referenced tokens.
     */
    case class Reference(tokens: Seq[NCNlpSentenceToken], types: Set[String]) {
        require(tokens.nonEmpty)
        require(types.nonEmpty)
    }

    // Note type ID produced by this enricher.
    private final val TOK_ID = "nlpcraft:relation"

    // Built-in note types that can participate in a relation (besides user-defined notes).
    private final val REL_TYPES = Seq(
        "nlpcraft:continent",
        "nlpcraft:subcontinent",
        "nlpcraft:country",
        "nlpcraft:metro",
        "nlpcraft:region",
        "nlpcraft:city",
        "nlpcraft:date"
    )

    // Initialized in start(); sorted by descending stem-set size so the most specific holder wins.
    private var FUNCS: Seq[Holder] = _
    // Union of all function stems, used for the fast "important token" check.
    private var ALL_FUNC_STEMS: Set[String] = _

    /**
     * Starts this component. Builds the relation function holders ("compare" and
     * "correlate" synonyms expanded via macros) and the aggregate stem set.
     *
     * @param parent Optional parent tracing span.
     */
    override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ ⇒
        val macros = NCMacroParser()

        FUNCS = {
            val seq = mutable.ArrayBuffer.empty[Holder]

            /**
             * Adds holders for one relation function.
             *
             * @param f Function ID.
             * @param head Mandatory "start" word(s), as a macro expression; each expansion must be a single word.
             * @param tails All possible "connecting" words (zero or more can be in the sentence).
             */
            def add(f: String, head: String, tails: Seq[String]): Unit = {
                require(!head.contains(" "))

                val tailsSeq = tails.toSet.map(NCNlpCoreManager.stem)

                macros.expand(head).map(NCNlpCoreManager.stem).foreach(s ⇒ seq += Holder(f, s, tailsSeq + s))
            }

            val tails = Seq(
                "to",
                "and",
                "for",
                "with",
                "in",
                "of",
                "between",
                "statistic",
                "stats",
                "info",
                "metrics",
                "information",
                "data",
                "vs",
                "versus"
            )

            add("compare", "{compare|comparing|contrast|difference|match|vs|versus}", tails)
            add("correlate", "{correlate|correlation|relation|relationship}", tails)

            // Most specific (largest stem set) first.
            seq.sortBy(-_.allStems.size)
        }

        ALL_FUNC_STEMS = FUNCS.flatMap(_.allStems).toSet

        super.start()
    }

    /**
     * Stops this component.
     *
     * @param parent Optional parent tracing span.
     */
    override def stop(parent: Span = null): Unit = startScopedSpan("stop", parent) { _ ⇒
        super.stop()
    }

    /**
     * Checks whether important tokens were deleted as stopwords or not.
     *
     * @param ns Sentence.
     * @param toks Tokens in which some stopwords can be deleted.
     * @return `true` if no important token of the covered sentence span is missing from `toks`.
     */
    private def validImportant(ns: NCNlpSentence, toks: Seq[NCNlpSentenceToken]): Boolean = {
        // A token matters if it carries a user/relation note or is one of the function words.
        def isImportant(t: NCNlpSentenceToken): Boolean =
            t.exists(n ⇒ n.isUser || REL_TYPES.contains(n.noteType)) || ALL_FUNC_STEMS.contains(t.stem)

        val idxs = toks.map(_.index)

        require(idxs == idxs.sorted)

        // Full sentence slice covering the same index range, including dropped stopwords.
        val toks2 = ns.slice(idxs.head, idxs.last + 1)

        toks.length == toks2.length || toks.count(isImportant) == toks2.count(isImportant)
    }

    /**
     * Enriches the sentence with `nlpcraft:relation` notes.
     *
     * @param mdl Probe model.
     * @param ns Sentence to enrich.
     * @param senMeta Sentence metadata.
     * @param parent Optional parent tracing span.
     */
    @throws[NCE]
    override def enrich(mdl: NCProbeModel, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span = null): Unit =
        startScopedSpan("enrich", parent,
            "srvReqId" → ns.srvReqId,
            "mdlId" → mdl.model.getId,
            "txt" → ns.text) { _ ⇒
            // Tries to grab tokens direct way.
            // Example: A, B, C ⇒ ABC, AB, BC .. (AB will be processed first)
            val notes = mutable.HashSet.empty[NCNlpSentenceNote]

            for (toks ← ns.tokenMixWithStopWords() if validImportant(ns, toks))
                tryToMatch(toks) match {
                    case Some(m) ⇒
                        for (refNote ← m.refNotes) {
                            val note = NCNlpSentenceNote(
                                Seq(m.matchedHead.index),
                                TOK_ID,
                                "type" → m.funcType,
                                "indexes" → m.refIndexes,
                                "note" → refNote
                            )

                            // Skip duplicates of already-added (or similar) notes.
                            if (!notes.exists(n ⇒ ns.notesEqualOrSimilar(n, note))) {
                                notes += note

                                // Non-head matched tokens become stopwords attributed to this note.
                                m.matched.filter(_ != m.matchedHead).foreach(_.addStopReason(note))

                                m.matchedHead.add(note)
                            }
                        }
                    case None ⇒ // No-op.
                }
        }

    /**
     * Finds a reference (tokens with user or relation notes) fully contained in the given tokens.
     * A note qualifies only when at least one other qualified note does not overlap it, and only
     * note types that occur more than once are kept.
     *
     * @param toks Tokens to scan.
     * @return Reference, or `None` if no suitable notes found.
     */
    private def getReference(toks: Seq[NCNlpSentenceToken]): Option[Reference] =
        if (toks.isEmpty)
            None
        else {
            val sortedToks = toks.sortBy(_.index)

            val i1 = sortedToks.head.index
            val i2 = sortedToks.last.index

            val notes =
                sortedToks.flatMap(_.filter(n ⇒ n.isUser || REL_TYPES.contains(n.noteType))).
                    // Finds notes for tokens related to only given tokens.
                    filter(n ⇒ n.tokenFrom >= i1 && n.tokenTo <= i2)

            val suitNotes =
                notes.
                    // Keep only notes that have at least one non-overlapping companion.
                    filter(n1 ⇒ notes.exists(
                        n2 ⇒ n2.tokenFrom > n1.tokenTo || n2.tokenTo < n1.tokenFrom)
                    ).
                    groupBy(_.noteType).
                    // A relation needs at least two notes of the same type.
                    flatMap { case (_, ns) ⇒ if (ns.size > 1) ns else Seq.empty }

            if (suitNotes.nonEmpty)
                Some(
                    Reference(
                        toks.filter(t ⇒ suitNotes.exists(t.contains)),
                        suitNotes.map(_.noteType).toSet
                    )
                )
            else
                None
        }

    /**
     * Tries to match a relation function against the given tokens: splits them into reference
     * candidates (tokens with user/relation notes) and function-word candidates, then matches
     * the latter against the configured holders — first with all tokens, then ignoring stopwords.
     *
     * @param toks Tokens to match.
     * @return Match, or `None` if the tokens do not form a relation.
     */
    private def tryToMatch(toks: Seq[NCNlpSentenceToken]): Option[Match] = {
        val i1 = toks.head.index
        val i2 = toks.last.index

        // Tokens whose user/relation notes are fully inside the given span.
        var refOpts = toks.
            filter(t ⇒
                t.exists(n ⇒ (
                    n.isUser || REL_TYPES.contains(n.noteType)) &&
                    n.tokenIndexes.head >= i1 &&
                    n.tokenIndexes.last <= i2
                )
            )

        // Remaining tokens are candidates for the function words.
        val matchOpts = toks.diff(refOpts)

        if (refOpts.nonEmpty && matchOpts.nonEmpty)
            getReference(refOpts) match {
                case Some(r) ⇒
                    refOpts = r.tokens

                    // Matches given stems against holders: all stems must belong to the holder
                    // and the head stem must be present. FUNCS is pre-sorted, most specific first.
                    def try0(stems: Set[String]): Option[Match] =
                        FUNCS.
                            flatMap(h ⇒
                                if (stems.subsetOf(h.allStems) && stems.contains(h.headStem)) Some(h) else None
                            ).
                            headOption match {
                                case Some(h) ⇒
                                    Some(
                                        Match(
                                            h.funcType,
                                            matchOpts,
                                            matchOpts.
                                                find(_.stem == h.headStem).
                                                // Head stem was just verified present — absence is a bug.
                                                getOrElse(throw new AssertionError("Missed token.")),
                                            r.types,
                                            refOpts.map(_.index).asJava
                                        )
                                    )
                                case None ⇒ None
                            }

                    // First try with all candidate tokens, then retry ignoring stopwords.
                    try0(matchOpts.map(_.stem).toSet) match {
                        case Some(m) ⇒ Some(m)
                        case None ⇒ try0(matchOpts.filter(!_.isStopWord).map(_.stem).toSet)
                    }
                case None ⇒ None
            }
        else
            None
    }
}