nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala - incubator-nlpcraft - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      https://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.nlpcraft.probe.mgrs

 import org.apache.nlpcraft.common.U
 import org.apache.nlpcraft.common.nlp.{NCNlpSentenceToken, NCNlpSentenceTokenBuffer}
 import org.apache.nlpcraft.model._
 import org.apache.nlpcraft.model.intent.NCIdlContext
 import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.NCIdlContent
 import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind._

 import scala.collection.mutable

 /**
   * Probe synonym: an ordered buffer of [[NCProbeSynonymChunk]]s (this class is itself the
   * buffer) together with the flags describing the synonym and how it may be matched.
   *
   * @param isElementId Is this an implicit element ID synonym?
   *     In this case chunks contain the element ID.
   * @param isValueName Is this an implicit value name synonym?
   *     In this case chunks contain value name.
   * @param isDirect Direct or permuted synonym flag.
   * @param value Optional value name if this is a value synonym.
   * @param sparse Sparse flag: `sparseMatch(toks)` requires it to be set and `isMatch(toks)`
   *     requires it to be unset.
   * @param permute Permutation flag: when unset, sparse matching fails as soon as a chunk's first
   *     candidate does not have a greater index than the previously selected element.
   */
 class NCProbeSynonym(
     val isElementId: Boolean,
     val isValueName: Boolean,
     val isDirect: Boolean,
     val value: String = null,
     val sparse: Boolean,
     val permute: Boolean
 ) extends mutable.ArrayBuffer[NCProbeSynonymChunk] with Ordered[NCProbeSynonym] {
     // An element ID synonym can be neither a value name synonym nor carry a value.
     require((isElementId && !isValueName && value == null) || !isElementId)
     // A value name synonym must carry its value.
     require((isValueName && value != null) || !isValueName)

     // NOTE: all of the following are computed once, on first access, from the chunks held
     // at that moment; chunks appended afterwards are not reflected in the cached values.

     /** `true` if every chunk is of `TEXT` kind. */
     lazy val isTextOnly: Boolean = forall(_.kind == TEXT)
     /** Number of `REGEX` chunks. */
     lazy val regexChunks: Int = count(_.kind == REGEX)
     /** Number of `IDL` chunks. */
     lazy val idlChunks: Int = count(_.kind == IDL)
     /** `true` if there is at least one `IDL` chunk. */
     lazy val hasIdl: Boolean = idlChunks != 0
     /** `true` if a value name was supplied. */
     lazy val isValueSynonym: Boolean = value != null
     /** Word stems of all chunks joined with a single space. */
     lazy val stems: String = map(_.wordStem).mkString(" ")
     /** Hash code of [[stems]]. */
     lazy val stemsHash: Int = stems.hashCode

     /**
       * Gets the rank used to order chunk checks: `TEXT` first, then `IDL`, then `REGEX`.
       *
       * @param kind Chunk kind.
       * @return Sort rank of the given kind.
       * @throws AssertionError If the kind is not one of the three kinds above.
       */
     private def getSort(kind: NCSynonymChunkKind): Int =
         kind match {
             case TEXT => 0
             case IDL => 1
             case REGEX => 2
             case _ => throw new AssertionError(s"Unexpected kind: $kind")
         }

     /**
       * Checks a single NLP token against a single non-IDL chunk.
       *
       * @param tok NLP token to check.
       * @param chunk Chunk to check against; must be of `TEXT` or `REGEX` kind.
       * @return `true` if the token's stem equals the chunk's stem (`TEXT`), or the chunk's regex
       *     fully matches the token's original or normalized text (`REGEX`).
       * @throws AssertionError For `IDL` or any other chunk kind.
       */
     private def isMatch(tok: NCNlpSentenceToken, chunk: NCProbeSynonymChunk): Boolean =
         chunk.kind match {
             case TEXT => chunk.wordStem == tok.stem
             case REGEX =>
                 val regex = chunk.regex

                 regex.matcher(tok.origText).matches() || regex.matcher(tok.normText).matches()
             case IDL => throw new AssertionError()
             case _ => throw new AssertionError()
         }

     /**
       * Generic sparse matching: walks the chunks in order and selects, for each chunk, the first
       * not yet selected element that matches it.
       *
       * The match fails when: there are fewer elements than chunks; some chunk has no candidate;
       * `permute` is unset and the selected candidate's index is not greater than the index of the
       * previously selected element; the number of distinct candidates seen exceeds the number of
       * chunks; or, at the end, some candidate seen was not selected.
       *
       * @param toks Elements to match against.
       * @param isMatch Predicate telling whether an element matches a chunk.
       * @param getIndex Gets the position index of an element.
       * @param shouldBeNeighbors If `true`, the sorted indexes of the selected elements must also
       *     satisfy `U.isIncreased` (presumably a contiguity check -- confirm against `U`).
       * @tparam T Element type.
       * @return Selected elements in chunk order, or `None` if there is no match.
       */
     private def sparseMatch0[T](
         toks: Seq[T],
         isMatch: (T, NCProbeSynonymChunk) => Boolean,
         getIndex: T => Int,
         shouldBeNeighbors: Boolean
     ): Option[Seq[T]] =
         if (toks.size >= this.size) {
             // Selected elements, one per processed chunk, in chunk order.
             lazy val res = mutable.ArrayBuffer.empty[T]
             // All distinct candidates seen so far for any chunk.
             lazy val all = mutable.HashSet.empty[T]

             // 0: first chunk not processed yet, 1: in progress, -1: failed.
             var state = 0

             for (chunk <- this if state != -1) {
                 val seq =
                     if (state == 0) {
                         state = 1

                         toks.filter(t => isMatch(t, chunk))
                     }
                     else
                         // Already selected elements cannot be reused for another chunk.
                         toks.filter(t => !res.contains(t) && isMatch(t, chunk))

                 if (seq.nonEmpty) {
                     val head = seq.head

                     // Without permutation the selection must keep moving forward.
                     if (!permute && res.nonEmpty && getIndex(head) <= getIndex(res.last))
                         state = -1
                     else {
                         all ++= seq

                         // More distinct candidates than chunks.
                         if (all.size > this.size)
                             state = -1
                         else
                             res += head
                     }
                 }
                 else
                     state = -1
             }

             // `all.size == res.size` means every candidate seen was actually selected.
             if (state != -1 && all.size == res.size && (!shouldBeNeighbors || U.isIncreased(res.map(getIndex).toSeq.sorted)))
                 Some(res.toSeq)
             else
                 None
         }
         else
             None

     /**
       * Checks a single token-or-word against a single chunk.
       *
       * @param tow Either a model token (left) or an NLP word (right).
       * @param chunk Chunk to check against.
       * @param req Request passed to the IDL predicate context.
       * @return `true` on match. An `IDL` chunk is evaluated with the chunk's predicate for a
       *     token and never matches a word.
       * @throws AssertionError For an unexpected chunk kind.
       */
     private def isMatch(tow: NCIdlContent, chunk: NCProbeSynonymChunk, req: NCRequest): Boolean = {
         // Applies the function matching the side of `tow` that is actually present.
         def get0[T](fromToken: NCToken => T, fromWord: NCNlpSentenceToken => T): T =
             tow.fold(fromToken, fromWord)

         chunk.kind match {
             case TEXT => chunk.wordStem == get0(_.stem, _.stem)

             case REGEX =>
                 val r = chunk.regex

                 r.matcher(get0(_.origText, _.origText)).matches() || r.matcher(get0(_.normText, _.normText)).matches()

             case IDL =>
                 get0(t => chunk.idlPred.apply(t, NCIdlContext(req = req)).value.asInstanceOf[Boolean], _ => false)

             case _ => throw new AssertionError()
         }
     }

     /**
       * Position-by-position match of NLP tokens against this synonym. May only be called on a
       * non-sparse synonym without IDL chunks.
       *
       * @param toks Tokens to match; must not be `null`.
       * @return `true` if the number of tokens equals the number of chunks and every token matches
       *     the chunk at the same position.
       */
     def isMatch(toks: NCNlpSentenceTokenBuffer): Boolean = {
         require(toks != null)
         require(!sparse && !hasIdl)

         if (toks.length == length) {
             if (isTextOnly)
                 // Hash comparison first, full stems comparison only if hashes agree.
                 toks.stemsHash == stemsHash && toks.stems == stems
             else
                 // Pairs are checked in `getSort` order, i.e. TEXT chunks before REGEX ones.
                 toks.zip(this).sortBy(p => getSort(p._2.kind)).forall { case (tok, chunk) => isMatch(tok, chunk) }
         }
         else
             false
     }

     /**
       * Position-by-position match of tokens-or-words against this synonym.
       *
       * @param tows Tokens or words to match; must not be `null`.
       * @param req Request passed to IDL predicates.
       * @return `true` if the number of elements equals the number of chunks, there are at least as
       *     many tokens (left values) as IDL chunks, and every element matches the chunk at the
       *     same position.
       */
     def isMatch(tows: Seq[NCIdlContent], req: NCRequest): Boolean = {
         require(tows != null)

         // An IDL chunk never matches a word, so fewer tokens than IDL chunks cannot match.
         if (tows.length == length && tows.count(_.isLeft) >= idlChunks)
             tows.zip(this).sortBy(p => getSort(p._2.kind)).forall { case (tow, chunk) => isMatch(tow, chunk, req) }
         else
             false
     }

     /**
       * Sparse match of NLP tokens against this synonym. May only be called on a sparse synonym
       * without IDL chunks. Token start character index is used as the position index.
       *
       * @param toks Tokens to match; must not be `null`.
       * @return Matched tokens in chunk order, or `None` if there is no match.
       */
     def sparseMatch(toks: NCNlpSentenceTokenBuffer): Option[Seq[NCNlpSentenceToken]] = {
         require(toks != null)
         require(sparse && !hasIdl)

         sparseMatch0(toks.toSeq, isMatch, (t: NCNlpSentenceToken) => t.startCharIndex, shouldBeNeighbors = false)
     }

     /**
       * Sparse match of tokens-or-words against this synonym. May only be called on a synonym
       * that has IDL chunks. Start character index is used as the position index; for a
       * non-sparse synonym the additional neighbors check is requested.
       *
       * @param tows Tokens or words to match; must not be `null`.
       * @param req Request passed to IDL predicates; must not be `null`.
       * @return Matched elements in chunk order, or `None` if there is no match.
       */
     def sparseMatch(tows: Seq[NCIdlContent], req: NCRequest): Option[Seq[NCIdlContent]] = {
         require(tows != null)
         require(req != null)
         require(hasIdl)

         sparseMatch0(
             tows,
             (t: NCIdlContent, chunk: NCProbeSynonymChunk) => isMatch(t, chunk, req),
             (t: NCIdlContent) => t.fold(_.getStartCharIndex, _.startCharIndex),
             shouldBeNeighbors = !sparse
         )
     }

     /** Chunks joined with a single space. */
     override def toString(): String = mkString(" ")

     /**
       * Orders synonyms from least to most significant.
       *
       * Precedence, highest first: element ID synonyms; more chunks; direct over non-direct;
       * non-permuted over permuted; text-only over non-text-only. Two text-only synonyms are
       * ordered by value synonym flag, two non-text-only ones by the number of regex and IDL chunks
       * (fewer is greater).
       *
       * @param that Synonym to compare with; `null` is considered less than any synonym.
       * @return Positive, zero or negative number.
       * @throws IllegalArgumentException If this synonym has no IDL chunks and the `sparse` flags
       *     of the two synonyms differ.
       */
     override def compare(that: NCProbeSynonym): Int =
         // The null check must come first: the consistency check below dereferences `that`.
         if (that == null)
             1
         else {
             require(hasIdl || sparse == that.sparse, s"Invalid comparing [this=$this, that=$that]")

             def compareIsValueSynonym(): Int =
                 isValueSynonym match {
                     case true if !that.isValueSynonym => 1
                     case false if that.isValueSynonym => -1

                     case _ => 0
                 }

             isElementId match {
                 case true if !that.isElementId => 1
                 case false if that.isElementId => -1
                 case true if that.isElementId => 0

                 case _ => // None are element IDs.
                     if (length > that.length)
                         1
                     else if (length < that.length)
                         -1
                     else { // Equal length in chunks.
                         if (isDirect && !that.isDirect)
                             1
                         else if (!isDirect && that.isDirect)
                             -1
                         else if (permute && !that.permute)
                             -1
                         else if (!permute && that.permute)
                             1
                         else // Same direct and permute flags.
                             isTextOnly match {
                                 case true if !that.isTextOnly => 1
                                 case false if that.isTextOnly => -1
                                 case true if that.isTextOnly => compareIsValueSynonym()
                                 case _ =>
                                     val thisDynCnt = regexChunks + idlChunks
                                     val thatDynCnt = that.regexChunks + that.idlChunks

                                     // Fewer regex/IDL chunks means less uncertainty, i.e. larger weight.
                                     if (thisDynCnt < thatDynCnt)
                                         1
                                     else if (thisDynCnt > thatDynCnt)
                                         -1
                                     else
                                         0
                             }
                     }
             }
         }

     override def canEqual(other: Any): Boolean = other.isInstanceOf[NCProbeSynonym]

     /**
       * Two synonyms are equal when their chunk sequences are equal and the properties listed
       * below agree. Note that `isDirect`, `sparse` and `permute` are not compared.
       */
     override def equals(other: Any): Boolean = other match {
         case that: NCProbeSynonym =>
             super.equals(that) &&
                 (that canEqual this) &&
                 isTextOnly == that.isTextOnly &&
                 regexChunks == that.regexChunks &&
                 idlChunks == that.idlChunks &&
                 isValueSynonym == that.isValueSynonym &&
                 isElementId == that.isElementId &&
                 isValueName == that.isValueName &&
                 value == that.value
         case _ => false
     }

     /** Hash code built from the same properties that [[equals]] compares. */
     override def hashCode(): Int = {
         val state = Seq(
             super.hashCode(),
             isTextOnly,
             regexChunks,
             idlChunks,
             isValueSynonym,
             isElementId,
             isValueName,
             value
         )

         // `value` may be null.
         state.map(p => if (p == null) 0 else p.hashCode()).foldLeft(0)((a, b) => 31 * a + b)
     }
 }

 object NCProbeSynonym {
     /** Either a model token (left) or an NLP sentence word (right). */
     type NCIdlContent = Either[NCToken, NCNlpSentenceToken]

     /**
       * Creates a synonym with the given flags and fills it with the given chunks.
       *
       * @param isElementId Implicit element ID synonym flag.
       * @param isValueName Implicit value name synonym flag.
       * @param isDirect Direct synonym flag.
       * @param value Value name, `null` if this is not a value synonym.
       * @param chunks Chunks appended to the new synonym in the given order.
       * @param sparse Sparse flag.
       * @param permute Permutation flag.
       * @return Newly created synonym holding all given chunks.
       */
     def apply(
         isElementId: Boolean,
         isValueName: Boolean,
         isDirect: Boolean,
         value: String,
         chunks: Seq[NCProbeSynonymChunk],
         sparse: Boolean,
         permute: Boolean
     ): NCProbeSynonym = {
         val synonym =
             new NCProbeSynonym(
                 isElementId = isElementId,
                 isValueName = isValueName,
                 isDirect = isDirect,
                 value = value,
                 sparse = sparse,
                 permute = permute
             )

         // `++=` returns the buffer itself, which is the result.
         synonym ++= chunks
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* https://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.nlpcraft.probe.mgrs

	import org.apache.nlpcraft.common.U
	import org.apache.nlpcraft.common.nlp.{NCNlpSentenceToken, NCNlpSentenceTokenBuffer}
	import org.apache.nlpcraft.model._
	import org.apache.nlpcraft.model.intent.NCIdlContext
	import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.NCIdlContent
	import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind._

	import scala.collection.mutable

	/**
	  * Probe synonym: an ordered buffer of [[NCProbeSynonymChunk]]s (this class is itself the
	  * buffer) together with the flags describing the synonym and how it may be matched.
	  *
	  * @param isElementId Is this an implicit element ID synonym?
	  *     In this case chunks contain the element ID.
	  * @param isValueName Is this an implicit value name synonym?
	  *     In this case chunks contain value name.
	  * @param isDirect Direct or permuted synonym flag.
	  * @param value Optional value name if this is a value synonym.
	  * @param sparse Sparse flag: `sparseMatch(toks)` requires it to be set and `isMatch(toks)`
	  *     requires it to be unset.
	  * @param permute Permutation flag: when unset, sparse matching fails as soon as a chunk's first
	  *     candidate does not have a greater index than the previously selected element.
	  */
	class NCProbeSynonym(
	    val isElementId: Boolean,
	    val isValueName: Boolean,
	    val isDirect: Boolean,
	    val value: String = null,
	    val sparse: Boolean,
	    val permute: Boolean
	) extends mutable.ArrayBuffer[NCProbeSynonymChunk] with Ordered[NCProbeSynonym] {
	    // An element ID synonym can be neither a value name synonym nor carry a value.
	    require((isElementId && !isValueName && value == null) || !isElementId)
	    // A value name synonym must carry its value.
	    require((isValueName && value != null) || !isValueName)

	    // NOTE: all of the following are computed once, on first access, from the chunks held
	    // at that moment; chunks appended afterwards are not reflected in the cached values.

	    /** `true` if every chunk is of `TEXT` kind. */
	    lazy val isTextOnly: Boolean = forall(_.kind == TEXT)
	    /** Number of `REGEX` chunks. */
	    lazy val regexChunks: Int = count(_.kind == REGEX)
	    /** Number of `IDL` chunks. */
	    lazy val idlChunks: Int = count(_.kind == IDL)
	    /** `true` if there is at least one `IDL` chunk. */
	    lazy val hasIdl: Boolean = idlChunks != 0
	    /** `true` if a value name was supplied. */
	    lazy val isValueSynonym: Boolean = value != null
	    /** Word stems of all chunks joined with a single space. */
	    lazy val stems: String = map(_.wordStem).mkString(" ")
	    /** Hash code of [[stems]]. */
	    lazy val stemsHash: Int = stems.hashCode

	    /**
	      * Gets the rank used to order chunk checks: `TEXT` first, then `IDL`, then `REGEX`.
	      *
	      * @param kind Chunk kind.
	      * @return Sort rank of the given kind.
	      * @throws AssertionError If the kind is not one of the three kinds above.
	      */
	    private def getSort(kind: NCSynonymChunkKind): Int =
	        kind match {
	            case TEXT => 0
	            case IDL => 1
	            case REGEX => 2
	            case _ => throw new AssertionError(s"Unexpected kind: $kind")
	        }

	    /**
	      * Checks a single NLP token against a single non-IDL chunk.
	      *
	      * @param tok NLP token to check.
	      * @param chunk Chunk to check against; must be of `TEXT` or `REGEX` kind.
	      * @return `true` if the token's stem equals the chunk's stem (`TEXT`), or the chunk's regex
	      *     fully matches the token's original or normalized text (`REGEX`).
	      * @throws AssertionError For `IDL` or any other chunk kind.
	      */
	    private def isMatch(tok: NCNlpSentenceToken, chunk: NCProbeSynonymChunk): Boolean =
	        chunk.kind match {
	            case TEXT => chunk.wordStem == tok.stem
	            case REGEX =>
	                val regex = chunk.regex

	                regex.matcher(tok.origText).matches() || regex.matcher(tok.normText).matches()
	            case IDL => throw new AssertionError()
	            case _ => throw new AssertionError()
	        }

	    /**
	      * Generic sparse matching: walks the chunks in order and selects, for each chunk, the first
	      * not yet selected element that matches it.
	      *
	      * The match fails when: there are fewer elements than chunks; some chunk has no candidate;
	      * `permute` is unset and the selected candidate's index is not greater than the index of the
	      * previously selected element; the number of distinct candidates seen exceeds the number of
	      * chunks; or, at the end, some candidate seen was not selected.
	      *
	      * @param toks Elements to match against.
	      * @param isMatch Predicate telling whether an element matches a chunk.
	      * @param getIndex Gets the position index of an element.
	      * @param shouldBeNeighbors If `true`, the sorted indexes of the selected elements must also
	      *     satisfy `U.isIncreased` (presumably a contiguity check -- confirm against `U`).
	      * @tparam T Element type.
	      * @return Selected elements in chunk order, or `None` if there is no match.
	      */
	    private def sparseMatch0[T](
	        toks: Seq[T],
	        isMatch: (T, NCProbeSynonymChunk) => Boolean,
	        getIndex: T => Int,
	        shouldBeNeighbors: Boolean
	    ): Option[Seq[T]] =
	        if (toks.size >= this.size) {
	            // Selected elements, one per processed chunk, in chunk order.
	            lazy val res = mutable.ArrayBuffer.empty[T]
	            // All distinct candidates seen so far for any chunk.
	            lazy val all = mutable.HashSet.empty[T]

	            // 0: first chunk not processed yet, 1: in progress, -1: failed.
	            var state = 0

	            for (chunk <- this if state != -1) {
	                val seq =
	                    if (state == 0) {
	                        state = 1

	                        toks.filter(t => isMatch(t, chunk))
	                    }
	                    else
	                        // Already selected elements cannot be reused for another chunk.
	                        toks.filter(t => !res.contains(t) && isMatch(t, chunk))

	                if (seq.nonEmpty) {
	                    val head = seq.head

	                    // Without permutation the selection must keep moving forward.
	                    if (!permute && res.nonEmpty && getIndex(head) <= getIndex(res.last))
	                        state = -1
	                    else {
	                        all ++= seq

	                        // More distinct candidates than chunks.
	                        if (all.size > this.size)
	                            state = -1
	                        else
	                            res += head
	                    }
	                }
	                else
	                    state = -1
	            }

	            // `all.size == res.size` means every candidate seen was actually selected.
	            if (state != -1 && all.size == res.size && (!shouldBeNeighbors || U.isIncreased(res.map(getIndex).toSeq.sorted)))
	                Some(res.toSeq)
	            else
	                None
	        }
	        else
	            None

	    /**
	      * Checks a single token-or-word against a single chunk.
	      *
	      * @param tow Either a model token (left) or an NLP word (right).
	      * @param chunk Chunk to check against.
	      * @param req Request passed to the IDL predicate context.
	      * @return `true` on match. An `IDL` chunk is evaluated with the chunk's predicate for a
	      *     token and never matches a word.
	      * @throws AssertionError For an unexpected chunk kind.
	      */
	    private def isMatch(tow: NCIdlContent, chunk: NCProbeSynonymChunk, req: NCRequest): Boolean = {
	        // Applies the function matching the side of `tow` that is actually present.
	        def get0[T](fromToken: NCToken => T, fromWord: NCNlpSentenceToken => T): T =
	            tow.fold(fromToken, fromWord)

	        chunk.kind match {
	            case TEXT => chunk.wordStem == get0(_.stem, _.stem)

	            case REGEX =>
	                val r = chunk.regex

	                r.matcher(get0(_.origText, _.origText)).matches() || r.matcher(get0(_.normText, _.normText)).matches()

	            case IDL =>
	                get0(t => chunk.idlPred.apply(t, NCIdlContext(req = req)).value.asInstanceOf[Boolean], _ => false)

	            case _ => throw new AssertionError()
	        }
	    }

	    /**
	      * Position-by-position match of NLP tokens against this synonym. May only be called on a
	      * non-sparse synonym without IDL chunks.
	      *
	      * @param toks Tokens to match; must not be `null`.
	      * @return `true` if the number of tokens equals the number of chunks and every token matches
	      *     the chunk at the same position.
	      */
	    def isMatch(toks: NCNlpSentenceTokenBuffer): Boolean = {
	        require(toks != null)
	        require(!sparse && !hasIdl)

	        if (toks.length == length) {
	            if (isTextOnly)
	                // Hash comparison first, full stems comparison only if hashes agree.
	                toks.stemsHash == stemsHash && toks.stems == stems
	            else
	                // Pairs are checked in `getSort` order, i.e. TEXT chunks before REGEX ones.
	                toks.zip(this).sortBy(p => getSort(p._2.kind)).forall { case (tok, chunk) => isMatch(tok, chunk) }
	        }
	        else
	            false
	    }

	    /**
	      * Position-by-position match of tokens-or-words against this synonym.
	      *
	      * @param tows Tokens or words to match; must not be `null`.
	      * @param req Request passed to IDL predicates.
	      * @return `true` if the number of elements equals the number of chunks, there are at least as
	      *     many tokens (left values) as IDL chunks, and every element matches the chunk at the
	      *     same position.
	      */
	    def isMatch(tows: Seq[NCIdlContent], req: NCRequest): Boolean = {
	        require(tows != null)

	        // An IDL chunk never matches a word, so fewer tokens than IDL chunks cannot match.
	        if (tows.length == length && tows.count(_.isLeft) >= idlChunks)
	            tows.zip(this).sortBy(p => getSort(p._2.kind)).forall { case (tow, chunk) => isMatch(tow, chunk, req) }
	        else
	            false
	    }

	    /**
	      * Sparse match of NLP tokens against this synonym. May only be called on a sparse synonym
	      * without IDL chunks. Token start character index is used as the position index.
	      *
	      * @param toks Tokens to match; must not be `null`.
	      * @return Matched tokens in chunk order, or `None` if there is no match.
	      */
	    def sparseMatch(toks: NCNlpSentenceTokenBuffer): Option[Seq[NCNlpSentenceToken]] = {
	        require(toks != null)
	        require(sparse && !hasIdl)

	        sparseMatch0(toks.toSeq, isMatch, (t: NCNlpSentenceToken) => t.startCharIndex, shouldBeNeighbors = false)
	    }

	    /**
	      * Sparse match of tokens-or-words against this synonym. May only be called on a synonym
	      * that has IDL chunks. Start character index is used as the position index; for a
	      * non-sparse synonym the additional neighbors check is requested.
	      *
	      * @param tows Tokens or words to match; must not be `null`.
	      * @param req Request passed to IDL predicates; must not be `null`.
	      * @return Matched elements in chunk order, or `None` if there is no match.
	      */
	    def sparseMatch(tows: Seq[NCIdlContent], req: NCRequest): Option[Seq[NCIdlContent]] = {
	        require(tows != null)
	        require(req != null)
	        require(hasIdl)

	        sparseMatch0(
	            tows,
	            (t: NCIdlContent, chunk: NCProbeSynonymChunk) => isMatch(t, chunk, req),
	            (t: NCIdlContent) => t.fold(_.getStartCharIndex, _.startCharIndex),
	            shouldBeNeighbors = !sparse
	        )
	    }

	    /** Chunks joined with a single space. */
	    override def toString(): String = mkString(" ")

	    /**
	      * Orders synonyms from least to most significant.
	      *
	      * Precedence, highest first: element ID synonyms; more chunks; direct over non-direct;
	      * non-permuted over permuted; text-only over non-text-only. Two text-only synonyms are
	      * ordered by value synonym flag, two non-text-only ones by the number of regex and IDL chunks
	      * (fewer is greater).
	      *
	      * @param that Synonym to compare with; `null` is considered less than any synonym.
	      * @return Positive, zero or negative number.
	      * @throws IllegalArgumentException If this synonym has no IDL chunks and the `sparse` flags
	      *     of the two synonyms differ.
	      */
	    override def compare(that: NCProbeSynonym): Int =
	        // The null check must come first: the consistency check below dereferences `that`.
	        if (that == null)
	            1
	        else {
	            require(hasIdl || sparse == that.sparse, s"Invalid comparing [this=$this, that=$that]")

	            def compareIsValueSynonym(): Int =
	                isValueSynonym match {
	                    case true if !that.isValueSynonym => 1
	                    case false if that.isValueSynonym => -1

	                    case _ => 0
	                }

	            isElementId match {
	                case true if !that.isElementId => 1
	                case false if that.isElementId => -1
	                case true if that.isElementId => 0

	                case _ => // None are element IDs.
	                    if (length > that.length)
	                        1
	                    else if (length < that.length)
	                        -1
	                    else { // Equal length in chunks.
	                        if (isDirect && !that.isDirect)
	                            1
	                        else if (!isDirect && that.isDirect)
	                            -1
	                        else if (permute && !that.permute)
	                            -1
	                        else if (!permute && that.permute)
	                            1
	                        else // Same direct and permute flags.
	                            isTextOnly match {
	                                case true if !that.isTextOnly => 1
	                                case false if that.isTextOnly => -1
	                                case true if that.isTextOnly => compareIsValueSynonym()
	                                case _ =>
	                                    val thisDynCnt = regexChunks + idlChunks
	                                    val thatDynCnt = that.regexChunks + that.idlChunks

	                                    // Fewer regex/IDL chunks means less uncertainty, i.e. larger weight.
	                                    if (thisDynCnt < thatDynCnt)
	                                        1
	                                    else if (thisDynCnt > thatDynCnt)
	                                        -1
	                                    else
	                                        0
	                            }
	                    }
	            }
	        }

	    override def canEqual(other: Any): Boolean = other.isInstanceOf[NCProbeSynonym]

	    /**
	      * Two synonyms are equal when their chunk sequences are equal and the properties listed
	      * below agree. Note that `isDirect`, `sparse` and `permute` are not compared.
	      */
	    override def equals(other: Any): Boolean = other match {
	        case that: NCProbeSynonym =>
	            super.equals(that) &&
	                (that canEqual this) &&
	                isTextOnly == that.isTextOnly &&
	                regexChunks == that.regexChunks &&
	                idlChunks == that.idlChunks &&
	                isValueSynonym == that.isValueSynonym &&
	                isElementId == that.isElementId &&
	                isValueName == that.isValueName &&
	                value == that.value
	        case _ => false
	    }

	    /** Hash code built from the same properties that [[equals]] compares. */
	    override def hashCode(): Int = {
	        val state = Seq(
	            super.hashCode(),
	            isTextOnly,
	            regexChunks,
	            idlChunks,
	            isValueSynonym,
	            isElementId,
	            isValueName,
	            value
	        )

	        // `value` may be null.
	        state.map(p => if (p == null) 0 else p.hashCode()).foldLeft(0)((a, b) => 31 * a + b)
	    }
	}

	object NCProbeSynonym {
	    /** Either a model token (left) or an NLP sentence word (right). */
	    type NCIdlContent = Either[NCToken, NCNlpSentenceToken]

	    /**
	      * Creates a synonym with the given flags and fills it with the given chunks.
	      *
	      * @param isElementId Implicit element ID synonym flag.
	      * @param isValueName Implicit value name synonym flag.
	      * @param isDirect Direct synonym flag.
	      * @param value Value name, `null` if this is not a value synonym.
	      * @param chunks Chunks appended to the new synonym in the given order.
	      * @param sparse Sparse flag.
	      * @param permute Permutation flag.
	      * @return Newly created synonym holding all given chunks.
	      */
	    def apply(
	        isElementId: Boolean,
	        isValueName: Boolean,
	        isDirect: Boolean,
	        value: String,
	        chunks: Seq[NCProbeSynonymChunk],
	        sparse: Boolean,
	        permute: Boolean
	    ): NCProbeSynonym = {
	        val synonym =
	            new NCProbeSynonym(
	                isElementId = isElementId,
	                isValueName = isValueName,
	                isDirect = isDirect,
	                value = value,
	                sparse = sparse,
	                permute = permute
	            )

	        // `++=` returns the buffer itself, which is the result.
	        synonym ++= chunks
	    }
	}