nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala - incubator-nlpcraft - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.nlpcraft.probe.mgrs

 import org.apache.nlpcraft.common.nlp.{NCNlpSentenceToken, NCNlpSentenceTokenBuffer}
 import org.apache.nlpcraft.model._
 import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind._

 import scala.collection.mutable.ArrayBuffer

 /**
   * Synonym represented as an ordered buffer of [[NCProbeSynonymChunk]] chunks.
   *
   * Chunks are matched positionally against sentence tokens (see `isMatch`) and synonyms
   * are ordered from least to most significant (see `compare`).
   *
   * Note that the derived properties (`isTextOnly`, `regexChunks`, `dslChunks`, `stems`, ...)
   * are lazy values: they are computed on first access and are not recomputed if chunks are
   * added to or removed from this buffer afterwards.
   *
   * @param isElementId Is this an implicit element ID synonym?
   *     In this case chunks contain the element ID.
   * @param isValueName Is this an implicit value name synonym?
   *     In this case chunks contain value name.
   * @param isDirect Direct or permuted synonym flag.
   * @param value Optional value name if this is a value synonym.
   */
 class NCProbeSynonym(
     val isElementId: Boolean,
     val isValueName: Boolean,
     val isDirect: Boolean,
     val value: String = null
 ) extends ArrayBuffer[NCProbeSynonymChunk] with Ordered[NCProbeSynonym] {
     // An element ID synonym can neither be a value name synonym nor carry a value.
     require(
         !isElementId || (!isValueName && value == null),
         "Element ID synonym cannot be a value name synonym or have a value."
     )
     // A value name synonym must carry its value.
     require(!isValueName || value != null, "Value name synonym must have a value.")

     /** `true` if every chunk is of `TEXT` kind. */
     lazy val isTextOnly: Boolean = forall(_.kind == TEXT)
     /** Number of `REGEX` chunks. */
     lazy val regexChunks: Int = count(_.kind == REGEX)
     /** Number of `DSL` chunks. */
     lazy val dslChunks: Int = count(_.kind == DSL)
     /** `true` if a value name was supplied. */
     lazy val isValueSynonym: Boolean = value != null
     /** Space-separated word stems of all chunks. */
     lazy val stems: String = map(_.wordStem).mkString(" ")
     /** Hash code of `stems`, used as a cheap pre-check before comparing the strings. */
     lazy val stemsHash: Int = stems.hashCode

     /**
       * Gets the evaluation priority of the given chunk kind. Chunks with a lower value
       * are checked first when matching.
       *
       * @param kind Chunk kind.
       * @return Sort index: `TEXT` first, then `DSL`, then `REGEX`.
       */
     private def getSort(kind: NCSynonymChunkKind): Int =
         kind match {
             case TEXT ⇒ 0
             case DSL ⇒ 1
             case REGEX ⇒ 2
             case _ ⇒ throw new AssertionError(s"Unexpected kind: $kind")
         }

     /**
       * Checks whether the given tokens match this synonym chunk-by-chunk.
       *
       * Only `TEXT` and `REGEX` chunks are supported by this overload; reaching a `DSL`
       * chunk raises an `AssertionError`.
       *
       * @param toks Tokens to match. Must not be `null`.
       * @return `true` if the number of tokens equals the number of chunks and every
       *     token matches its corresponding chunk.
       */
     def isMatch(toks: NCNlpSentenceTokenBuffer): Boolean = {
         require(toks != null, "Tokens cannot be null.")

         val ok =
             if (isTextOnly)
                 // Hash comparison first as a cheap way to reject most mismatches.
                 toks.stemsHash == stemsHash && toks.stems == stems
             else
                 // Pairs are positional; sorting only changes the order of evaluation.
                 toks.zip(this).sortBy(p ⇒ getSort(p._2.kind)).forall {
                     case (tok, chunk) ⇒
                         chunk.kind match {
                             case TEXT ⇒ chunk.wordStem == tok.stem
                             case REGEX ⇒
                                 val r = chunk.regex

                                 r.matcher(tok.origText).matches() || r.matcher(tok.normText).matches()
                             case DSL ⇒ throw new AssertionError("DSL chunk cannot be matched against NLP sentence tokens.")
                             case _ ⇒ throw new AssertionError(s"Unexpected kind: ${chunk.kind}")
                         }
                 }

         // Should be called only for valid tokens count (validation optimized for performance reasons)
         ok && toks.length == length
     }

     /**
       * Checks whether the given sequence of tokens or words matches this synonym
       * chunk-by-chunk.
       *
       * A `DSL` chunk is evaluated via its predicate for a token (`Left`) and never
       * matches a word (`Right`).
       *
       * @param tows Sequence of tokens (`Left`) or words (`Right`). Must not be `null`.
       * @return `true` if the number of elements equals the number of chunks and every
       *     element matches its corresponding chunk.
       */
     def isMatch(tows: Seq[Either[NCToken, NCNlpSentenceToken]]): Boolean = {
         require(tows != null, "Tokens or words cannot be null.")

         type Token = NCToken
         type Word = NCNlpSentenceToken

         val ok =
             // Pairs are positional; sorting only changes the order of evaluation.
             tows.zip(this).sortBy(p ⇒ getSort(p._2.kind)).forall {
                 case (tow, chunk) ⇒
                     // Extracts a value from either side without unsafe projections.
                     def get0[T](fromToken: Token ⇒ T, fromWord: Word ⇒ T): T =
                         tow.fold(fromToken, fromWord)

                     chunk.kind match {
                         case TEXT ⇒ chunk.wordStem == get0((t: Token) ⇒ t.stem, (w: Word) ⇒ w.stem)
                         case REGEX ⇒
                             val r = chunk.regex

                             r.matcher(get0((t: Token) ⇒ t.origText, (w: Word) ⇒ w.origText)).matches() ||
                             r.matcher(get0((t: Token) ⇒ t.normText, (w: Word) ⇒ w.normText)).matches()
                         case DSL ⇒ get0((t: Token) ⇒ chunk.dslPred.apply(t), (_: Word) ⇒ false)

                         case _ ⇒ throw new AssertionError(s"Unexpected kind: ${chunk.kind}")
                     }
             }

         // Should be called only for valid tokens count (validation optimized for performance reasons)
         ok && tows.length == length
     }

     override def toString(): String = mkString(" ")

     /**
       * Orders synonyms from least to most significant. Criteria, in order of priority:
       *  1. element ID synonyms outrank all others (two of them tie);
       *  1. more chunks outrank fewer chunks;
       *  1. direct synonyms outrank permuted ones;
       *  1. text-only synonyms outrank the ones with regex/DSL chunks; among text-only
       *     synonyms a value synonym outranks a non-value one;
       *  1. otherwise fewer regex/DSL chunks outrank more of them.
       *
       * @param that Synonym to compare with. `null` is considered the least significant.
       * @return `1`, `-1` or `0`.
       */
     override def compare(that: NCProbeSynonym): Int = {
         // `true` outranks `false`; yields 1, -1 or 0.
         def cmpFlag(mine: Boolean, theirs: Boolean): Int = java.lang.Boolean.compare(mine, theirs)

         if (that == null)
             1
         else if (isElementId || that.isElementId)
             cmpFlag(isElementId, that.isElementId)
         else {
             // None are element IDs.
             val byLength = java.lang.Integer.compare(length, that.length)
             val byDirect = cmpFlag(isDirect, that.isDirect)

             if (byLength != 0)
                 byLength
             else if (byDirect != 0)
                 byDirect
             else if (isTextOnly && that.isTextOnly)
                 cmpFlag(isValueSynonym, that.isValueSynonym)
             else if (isTextOnly || that.isTextOnly)
                 cmpFlag(isTextOnly, that.isTextOnly)
             else {
                 val thisDynCnt = regexChunks + dslChunks
                 val thatDynCnt = that.regexChunks + that.dslChunks

                 // Less regex/DSL chunks means less uncertainty, i.e. larger weight.
                 java.lang.Integer.compare(thatDynCnt, thisDynCnt)
             }
         }
     }

     override def canEqual(other: Any): Boolean = other.isInstanceOf[NCProbeSynonym]

     // Equality is based on the chunks plus the flags below; `isDirect` does not take part in it.
     override def equals(other: Any): Boolean = other match {
         case that: NCProbeSynonym ⇒
             super.equals(that) &&
                 (that canEqual this) &&
                 isTextOnly == that.isTextOnly &&
                 regexChunks == that.regexChunks &&
                 dslChunks == that.dslChunks &&
                 isValueSynonym == that.isValueSynonym &&
                 isElementId == that.isElementId &&
                 isValueName == that.isValueName &&
                 value == that.value
         case _ ⇒ false
     }

     // Combines the same fields that `equals` compares; `value` may be `null`.
     override def hashCode(): Int = {
         val state = Seq(
             super.hashCode(),
             isTextOnly,
             regexChunks,
             dslChunks,
             isValueSynonym,
             isElementId,
             isValueName,
             value
         )

         state.map(p ⇒ if (p == null) 0 else p.hashCode()).foldLeft(0)((a, b) ⇒ 31 * a + b)
     }
 }

 object NCProbeSynonym {
     /**
       * Creates a new synonym and fills it with the given chunks.
       *
       * @param isElementId Is this an implicit element ID synonym?
       * @param isValueName Is this an implicit value name synonym?
       * @param isDirect Direct or permuted synonym flag.
       * @param value Optional value name (can be `null`).
       * @param chunks Chunks to add to the new synonym, in order.
       * @return Newly created synonym containing the given chunks.
       */
     def apply(isElementId: Boolean, isValueName: Boolean, isDirect: Boolean, value: String, chunks: Seq[NCProbeSynonymChunk]): NCProbeSynonym = {
         // The reference is never reassigned - only the buffer content is mutated.
         val syn = new NCProbeSynonym(isElementId, isValueName, isDirect, value)

         syn ++= chunks

         syn
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.nlpcraft.probe.mgrs

	import org.apache.nlpcraft.common.nlp.{NCNlpSentenceToken, NCNlpSentenceTokenBuffer}
	import org.apache.nlpcraft.model._
	import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind._

	import scala.collection.mutable.ArrayBuffer

	/**
	  * Synonym represented as an ordered buffer of [[NCProbeSynonymChunk]] chunks.
	  *
	  * Chunks are matched positionally against sentence tokens (see `isMatch`) and synonyms
	  * are ordered from least to most significant (see `compare`).
	  *
	  * Note that the derived properties (`isTextOnly`, `regexChunks`, `dslChunks`, `stems`, ...)
	  * are lazy values: they are computed on first access and are not recomputed if chunks are
	  * added to or removed from this buffer afterwards.
	  *
	  * @param isElementId Is this an implicit element ID synonym?
	  *     In this case chunks contain the element ID.
	  * @param isValueName Is this an implicit value name synonym?
	  *     In this case chunks contain value name.
	  * @param isDirect Direct or permuted synonym flag.
	  * @param value Optional value name if this is a value synonym.
	  */
	class NCProbeSynonym(
	    val isElementId: Boolean,
	    val isValueName: Boolean,
	    val isDirect: Boolean,
	    val value: String = null
	) extends ArrayBuffer[NCProbeSynonymChunk] with Ordered[NCProbeSynonym] {
	    // An element ID synonym can neither be a value name synonym nor carry a value.
	    require(
	        !isElementId || (!isValueName && value == null),
	        "Element ID synonym cannot be a value name synonym or have a value."
	    )
	    // A value name synonym must carry its value.
	    require(!isValueName || value != null, "Value name synonym must have a value.")

	    /** `true` if every chunk is of `TEXT` kind. */
	    lazy val isTextOnly: Boolean = forall(_.kind == TEXT)
	    /** Number of `REGEX` chunks. */
	    lazy val regexChunks: Int = count(_.kind == REGEX)
	    /** Number of `DSL` chunks. */
	    lazy val dslChunks: Int = count(_.kind == DSL)
	    /** `true` if a value name was supplied. */
	    lazy val isValueSynonym: Boolean = value != null
	    /** Space-separated word stems of all chunks. */
	    lazy val stems: String = map(_.wordStem).mkString(" ")
	    /** Hash code of `stems`, used as a cheap pre-check before comparing the strings. */
	    lazy val stemsHash: Int = stems.hashCode

	    /**
	      * Gets the evaluation priority of the given chunk kind. Chunks with a lower value
	      * are checked first when matching.
	      *
	      * @param kind Chunk kind.
	      * @return Sort index: `TEXT` first, then `DSL`, then `REGEX`.
	      */
	    private def getSort(kind: NCSynonymChunkKind): Int =
	        kind match {
	            case TEXT ⇒ 0
	            case DSL ⇒ 1
	            case REGEX ⇒ 2
	            case _ ⇒ throw new AssertionError(s"Unexpected kind: $kind")
	        }

	    /**
	      * Checks whether the given tokens match this synonym chunk-by-chunk.
	      *
	      * Only `TEXT` and `REGEX` chunks are supported by this overload; reaching a `DSL`
	      * chunk raises an `AssertionError`.
	      *
	      * @param toks Tokens to match. Must not be `null`.
	      * @return `true` if the number of tokens equals the number of chunks and every
	      *     token matches its corresponding chunk.
	      */
	    def isMatch(toks: NCNlpSentenceTokenBuffer): Boolean = {
	        require(toks != null, "Tokens cannot be null.")

	        val ok =
	            if (isTextOnly)
	                // Hash comparison first as a cheap way to reject most mismatches.
	                toks.stemsHash == stemsHash && toks.stems == stems
	            else
	                // Pairs are positional; sorting only changes the order of evaluation.
	                toks.zip(this).sortBy(p ⇒ getSort(p._2.kind)).forall {
	                    case (tok, chunk) ⇒
	                        chunk.kind match {
	                            case TEXT ⇒ chunk.wordStem == tok.stem
	                            case REGEX ⇒
	                                val r = chunk.regex

	                                r.matcher(tok.origText).matches() || r.matcher(tok.normText).matches()
	                            case DSL ⇒ throw new AssertionError("DSL chunk cannot be matched against NLP sentence tokens.")
	                            case _ ⇒ throw new AssertionError(s"Unexpected kind: ${chunk.kind}")
	                        }
	                }

	        // Should be called only for valid tokens count (validation optimized for performance reasons)
	        ok && toks.length == length
	    }

	    /**
	      * Checks whether the given sequence of tokens or words matches this synonym
	      * chunk-by-chunk.
	      *
	      * A `DSL` chunk is evaluated via its predicate for a token (`Left`) and never
	      * matches a word (`Right`).
	      *
	      * @param tows Sequence of tokens (`Left`) or words (`Right`). Must not be `null`.
	      * @return `true` if the number of elements equals the number of chunks and every
	      *     element matches its corresponding chunk.
	      */
	    def isMatch(tows: Seq[Either[NCToken, NCNlpSentenceToken]]): Boolean = {
	        require(tows != null, "Tokens or words cannot be null.")

	        type Token = NCToken
	        type Word = NCNlpSentenceToken

	        val ok =
	            // Pairs are positional; sorting only changes the order of evaluation.
	            tows.zip(this).sortBy(p ⇒ getSort(p._2.kind)).forall {
	                case (tow, chunk) ⇒
	                    // Extracts a value from either side without unsafe projections.
	                    def get0[T](fromToken: Token ⇒ T, fromWord: Word ⇒ T): T =
	                        tow.fold(fromToken, fromWord)

	                    chunk.kind match {
	                        case TEXT ⇒ chunk.wordStem == get0((t: Token) ⇒ t.stem, (w: Word) ⇒ w.stem)
	                        case REGEX ⇒
	                            val r = chunk.regex

	                            r.matcher(get0((t: Token) ⇒ t.origText, (w: Word) ⇒ w.origText)).matches() ||
	                            r.matcher(get0((t: Token) ⇒ t.normText, (w: Word) ⇒ w.normText)).matches()
	                        case DSL ⇒ get0((t: Token) ⇒ chunk.dslPred.apply(t), (_: Word) ⇒ false)

	                        case _ ⇒ throw new AssertionError(s"Unexpected kind: ${chunk.kind}")
	                    }
	            }

	        // Should be called only for valid tokens count (validation optimized for performance reasons)
	        ok && tows.length == length
	    }

	    override def toString(): String = mkString(" ")

	    /**
	      * Orders synonyms from least to most significant. Criteria, in order of priority:
	      *  1. element ID synonyms outrank all others (two of them tie);
	      *  1. more chunks outrank fewer chunks;
	      *  1. direct synonyms outrank permuted ones;
	      *  1. text-only synonyms outrank the ones with regex/DSL chunks; among text-only
	      *     synonyms a value synonym outranks a non-value one;
	      *  1. otherwise fewer regex/DSL chunks outrank more of them.
	      *
	      * @param that Synonym to compare with. `null` is considered the least significant.
	      * @return `1`, `-1` or `0`.
	      */
	    override def compare(that: NCProbeSynonym): Int = {
	        // `true` outranks `false`; yields 1, -1 or 0.
	        def cmpFlag(mine: Boolean, theirs: Boolean): Int = java.lang.Boolean.compare(mine, theirs)

	        if (that == null)
	            1
	        else if (isElementId || that.isElementId)
	            cmpFlag(isElementId, that.isElementId)
	        else {
	            // None are element IDs.
	            val byLength = java.lang.Integer.compare(length, that.length)
	            val byDirect = cmpFlag(isDirect, that.isDirect)

	            if (byLength != 0)
	                byLength
	            else if (byDirect != 0)
	                byDirect
	            else if (isTextOnly && that.isTextOnly)
	                cmpFlag(isValueSynonym, that.isValueSynonym)
	            else if (isTextOnly || that.isTextOnly)
	                cmpFlag(isTextOnly, that.isTextOnly)
	            else {
	                val thisDynCnt = regexChunks + dslChunks
	                val thatDynCnt = that.regexChunks + that.dslChunks

	                // Less regex/DSL chunks means less uncertainty, i.e. larger weight.
	                java.lang.Integer.compare(thatDynCnt, thisDynCnt)
	            }
	        }
	    }

	    override def canEqual(other: Any): Boolean = other.isInstanceOf[NCProbeSynonym]

	    // Equality is based on the chunks plus the flags below; `isDirect` does not take part in it.
	    override def equals(other: Any): Boolean = other match {
	        case that: NCProbeSynonym ⇒
	            super.equals(that) &&
	                (that canEqual this) &&
	                isTextOnly == that.isTextOnly &&
	                regexChunks == that.regexChunks &&
	                dslChunks == that.dslChunks &&
	                isValueSynonym == that.isValueSynonym &&
	                isElementId == that.isElementId &&
	                isValueName == that.isValueName &&
	                value == that.value
	        case _ ⇒ false
	    }

	    // Combines the same fields that `equals` compares; `value` may be `null`.
	    override def hashCode(): Int = {
	        val state = Seq(
	            super.hashCode(),
	            isTextOnly,
	            regexChunks,
	            dslChunks,
	            isValueSynonym,
	            isElementId,
	            isValueName,
	            value
	        )

	        state.map(p ⇒ if (p == null) 0 else p.hashCode()).foldLeft(0)((a, b) ⇒ 31 * a + b)
	    }
	}

	object NCProbeSynonym {
	    /**
	      * Creates a new synonym and fills it with the given chunks.
	      *
	      * @param isElementId Is this an implicit element ID synonym?
	      * @param isValueName Is this an implicit value name synonym?
	      * @param isDirect Direct or permuted synonym flag.
	      * @param value Optional value name (can be `null`).
	      * @param chunks Chunks to add to the new synonym, in order.
	      * @return Newly created synonym containing the given chunks.
	      */
	    def apply(isElementId: Boolean, isValueName: Boolean, isDirect: Boolean, value: String, chunks: Seq[NCProbeSynonymChunk]): NCProbeSynonym = {
	        // The reference is never reassigned - only the buffer content is mutated.
	        val syn = new NCProbeSynonym(isElementId, isValueName, isDirect, value)

	        syn ++= chunks

	        syn
	    }
	}