Parts detection fixed: part tokens now carry their synonym chunk kind, so parts matched by TEXT chunks are reported with the `nlpcraft:nlp` id; DSL synonym lookup and variant collapsing made lazy, and misnamed test classes renamed (Model7Spec/Model8Spec).
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
index 6ed8f44..b8b7dc6 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
@@ -69,11 +69,10 @@
def isMatch(toks: NCNlpSentenceTokenBuffer): Boolean = {
require(toks != null)
- val ok =
+ if (toks.length == length) {
if (isTextOnly)
toks.stemsHash == stemsHash && toks.stems == stems
else
- // Same length.
toks.zip(this).sortBy(p ⇒ getSort(p._2.kind)).forall {
case (tok, chunk) ⇒
chunk.kind match {
@@ -83,9 +82,9 @@
case _ ⇒ throw new AssertionError()
}
}
-
- // Should be called only for valid tokens count (validation optimized for performance reasons)
- ok && toks.length == length
+ }
+ else
+ false
}
/**
@@ -100,27 +99,26 @@
type Word = NCNlpSentenceToken
type TokenOrWord = Either[Token, Word]
- val ok =
- // Same length.
+ if (tows.length == length && tows.count(_.isLeft) >= dslChunks)
tows.zip(this).sortBy(p ⇒ getSort(p._2.kind)).forall {
case (tow, chunk) ⇒
def get0[T](fromToken: Token ⇒ T, fromWord: Word ⇒ T): T =
if (tow.isLeft) fromToken(tow.left.get) else fromWord(tow.right.get)
chunk.kind match {
- case TEXT ⇒ chunk.wordStem == get0((t: Token) ⇒ t.stem, (w: Word) ⇒ w.stem)
+ case TEXT ⇒ chunk.wordStem == get0(_.stem, _.stem)
case REGEX ⇒
val r = chunk.regex
- r.matcher(get0((t: Token) ⇒ t.origText, (w: Word) ⇒ w.origText)).matches() ||
- r.matcher(get0((t: Token) ⇒ t.normText, (w: Word) ⇒ w.normText)).matches()
- case DSL ⇒ get0((t: Token) ⇒ chunk.dslPred.apply(t), (_: Word) ⇒ false)
+ r.matcher(get0(_.origText, _.origText)).matches() ||
+ r.matcher(get0(_.normText, _.normText)).matches()
+ case DSL ⇒ get0(t ⇒ chunk.dslPred.apply(t), _ ⇒ false)
case _ ⇒ throw new AssertionError()
}
}
- // Should be called only for valid tokens count (validation optimized for performance reasons)
- ok && tows.length == length
+ else
+ false
}
override def toString(): String = mkString(" ")
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala
index a68e305..21eaaab 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala
@@ -503,7 +503,7 @@
solver = solver,
intents = intents.keySet.toSeq,
synonyms = mkFastAccessMap(filter(syns, dsl = false), NCProbeSynonymsWrapper(_)),
- synonymsDsl = mkFastAccessMap(filter(syns, dsl = true), seq ⇒ seq),
+ synonymsDsl = mkFastAccessMap(filter(syns, dsl = true), _.sorted.reverse),
addStopWordsStems = addStopWords.toSet,
exclStopWordsStems = exclStopWords.toSet,
suspWordsStems = suspWords.toSet,
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index c4a7936..2a9dec0 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -17,21 +17,22 @@
package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model
-import java.io.Serializable
-import java.util
import io.opencensus.trace.Span
import org.apache.nlpcraft.common._
import org.apache.nlpcraft.common.nlp.{NCNlpSentenceToken, NCNlpSentenceTokenBuffer, _}
import org.apache.nlpcraft.model._
+import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.{NCSynonymChunkKind, TEXT}
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.impl.NCRequestImpl
-import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeSynonym, NCProbeSynonymsWrapper, NCProbeVariants}
+import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeSynonym, NCProbeVariants}
+import java.io.Serializable
+import java.util
import scala.collection.JavaConverters._
-import scala.compat.java8.OptionConverters._
import scala.collection.convert.DecorateAsScala
import scala.collection.mutable.ArrayBuffer
import scala.collection.{Map, Seq, mutable}
+import scala.compat.java8.OptionConverters._
/**
* Model elements enricher.
@@ -63,7 +64,7 @@
element: NCElement,
tokens: Seq[NCNlpSentenceToken],
synonym: NCProbeSynonym,
- parts: Seq[NCToken]
+ parts: Seq[(NCToken, NCSynonymChunkKind)]
) extends Ordered[ElementMatch] {
// Tokens sparsity.
lazy val sparsity: Int = tokens.zipWithIndex.tail.map {
@@ -198,7 +199,7 @@
direct: Boolean,
syn: Option[NCProbeSynonym],
metaOpt: Option[Map[String, Object]],
- parts: Seq[NCToken]
+ parts: Seq[(NCToken, NCSynonymChunkKind)]
): Unit = {
val params = mutable.ArrayBuffer.empty[(String, AnyRef)]
@@ -219,16 +220,16 @@
if (parts.nonEmpty) {
val partsData: Seq[util.HashMap[String, Any]] =
- parts.map(part ⇒ {
+ parts.map { case (part, kind) ⇒
val m = new util.HashMap[String, Any]()
- m.put("id", part.getId)
+ m.put("id", if (kind == TEXT) "nlpcraft:nlp" else part.getId)
m.put("startcharindex", part.getStartCharIndex)
m.put("endcharindex", part.getEndCharIndex)
m.put(TOK_META_ALIASES_KEY, part.getMetadata.get(TOK_META_ALIASES_KEY))
m
- })
+ }
params += "parts" → partsData.asJava
}
@@ -262,7 +263,7 @@
* @param toks
* @return
*/
- protected def combos[T](toks: Seq[T]): Seq[Seq[T]] =
+ private def combos[T](toks: Seq[T]): Seq[Seq[T]] =
(for (n ← toks.size until 0 by -1) yield toks.sliding(n)).flatten.map(p ⇒ p)
/**
@@ -295,7 +296,7 @@
varToks.flatMap(t ⇒
// Single word token is not split as words - token.
// Partly (not strict in) token - word.
- if ((toksComb.contains(t) || isSingleWord(t)) && inStrict(t))
+ if (inStrict(t) && (toksComb.contains(t) || isSingleWord(t)))
Seq(Complex(Left(t)))
else
t.wordIndexes.filter(nlpWordIdxs.contains).map(i ⇒ Complex(Right(initialSen(i))))
@@ -354,7 +355,7 @@
toks.map(t ⇒ (t.origText, t.index)).mkString(" ")
var permCnt = 0
- var collapsedSens: Seq[Seq[NCToken]] = null
+ lazy val collapsedSens = NCProbeVariants.convert(ns.srvReqId, mdl, ns.clone().collapse(mdl.model)).map(_.asScala)
/**
*
@@ -365,17 +366,19 @@
for (toks ← combos(perm)) {
val key = toks.map(_.index).sorted
- val sparsity = U.calcSparsity(key)
if (!cache.contains(key)) {
- var seq: Seq[Seq[Complex]] = null
+ cache += key
+
+ lazy val dslCombs = convert(ns, collapsedSens, toks).groupBy(_.length)
+ lazy val sparsity = U.calcSparsity(key)
// Attempt to match each element.
for (elm ← mdl.elements.values if !alreadyMarked(toks, elm.getId)) {
var found = false
def addMatch(
- elm: NCElement, toks: Seq[NCNlpSentenceToken], syn: NCProbeSynonym, parts: Seq[NCToken]
+ elm: NCElement, toks: Seq[NCNlpSentenceToken], syn: NCProbeSynonym, parts: Seq[(NCToken, NCSynonymChunkKind)]
): Unit =
if (
(elm.getJiggleFactor.isEmpty || elm.getJiggleFactor.get() >= sparsity) &&
@@ -425,25 +428,21 @@
if (mdl.synonymsDsl.nonEmpty) {
found = false
- if (collapsedSens == null)
- collapsedSens =
- NCProbeVariants.
- convert(ns.srvReqId, mdl, ns.clone().collapse(mdl.model)).map(_.asScala)
-
- if (seq == null)
- seq = convert(ns, collapsedSens, toks)
-
for (
- comb ← seq;
- syn ← fastAccess(mdl.synonymsDsl, elm.getId, comb.length).getOrElse(Seq.empty)
- if !found
+ (len, seq) ← dslCombs;
+ syn ← fastAccess(mdl.synonymsDsl, elm.getId, len).getOrElse(Seq.empty);
+ comb ← seq if !found;
+ data = comb.map(_.data)
)
- if (syn.isMatch(comb.map(_.data)))
- addMatch(elm, toks, syn, comb.filter(_.isToken).map(_.token))
+ if (syn.isMatch(data)) {
+ val parts = comb.zip(syn.map(_.kind)).flatMap {
+ case (complex, kind) ⇒ if (complex.isToken) Some(complex.token → kind) else None
+ }
+
+ addMatch(elm, toks, syn, parts)
+ }
}
}
-
- cache += key
}
}
}
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala
index 8912bc8..35e8e87 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala
@@ -30,6 +30,11 @@
private def checkText(t: NCToken, txt: String): Unit =
require(t.getOriginalText == txt, s"Expected text: $txt, token: $t")
+ private def checkToken(t: NCToken, id: String, txt: String): Unit = {
+ checkId(t, id)
+ checkText(t, txt)
+ }
+
override def onContext(ctx: NCContext): NCResult = {
val variants = ctx.getVariants.asScala
@@ -49,6 +54,18 @@
)
}
+ def checkWrapAnyWord(t: NCToken, any: String): Unit = {
+ val parts = t.getPartTokens.asScala
+
+ require(parts.size == 2)
+
+ checkToken(parts.head, "nlpcraft:nlp", "the")
+ checkToken(parts.last, "anyWord", any)
+
+ require(parts.last.isAbstract, s"Unexpected abstract token: ${parts.last}")
+
+ }
+
ctx.getRequest.getNormalizedText match {
case "word the word" ⇒
require(variants.size == 1)
@@ -57,20 +74,10 @@
require(toks.size == 2)
- checkId(toks.head, "nlpcraft:nlp")
- checkText(toks.head, "word")
+ checkToken(toks.head, "nlpcraft:nlp", "word")
+ checkToken(toks.last, "wrapAnyWord", "the word")
- checkId(toks.last, "wrapAnyWord")
- checkText(toks.last, "the word")
-
- val t2Parts = toks.last.getPartTokens.asScala
-
- require(t2Parts.size == 2)
-
- checkId(t2Parts.head,"anyWord")
- checkId(t2Parts.last, "anyWord")
-
- t2Parts.foreach(t ⇒ require(t.isAbstract, s"Unexpected abstract token: $t"))
+ checkWrapAnyWord(toks.last, "word")
case "10 w1 10 w2" ⇒
require(variants.nonEmpty)
@@ -85,16 +92,16 @@
require(toks.size == 2)
- checkText(toks.head, "10")
- checkText(toks.last,"w1 10 w2")
+ checkToken(toks.head, "nlpcraft:nlp", "10")
+ checkToken(toks.last,"wrapNum", "w1 10 w2")
val t2Parts = toks.last.getPartTokens.asScala
require(t2Parts.size == 3)
- checkId(t2Parts.head,"nlpcraft:nlp")
- checkId(t2Parts(1),"nlpcraft:num")
- checkId(t2Parts.last,"nlpcraft:nlp")
+ checkToken(t2Parts.head,"nlpcraft:nlp", "w1")
+ checkToken(t2Parts(1),"nlpcraft:num", "10")
+ checkToken(t2Parts.last,"nlpcraft:nlp", "w2")
case "before limit top 6 the any" ⇒
require(variants.nonEmpty)
@@ -109,8 +116,8 @@
require(toks.size == 2)
- checkText(toks.head, "before limit top 6")
- checkText(toks.last,"the any")
+ checkToken(toks.head, "wrapLimit", "before limit top 6")
+ checkToken(toks.last, "wrapAnyWord", "the any")
val wrap = toks.head.getPartTokens.asScala
@@ -118,6 +125,7 @@
checkLimit(wrap.last)
+ checkWrapAnyWord(toks.last, "any")
case "a wrap before limit top 6 the any" ⇒
require(variants.nonEmpty)
@@ -131,9 +139,9 @@
require(toks.size == 3)
- checkText(toks.head, "a")
- checkText(toks(1), "wrap before limit top 6")
- checkText(toks.last,"the any")
+ checkToken(toks.head, "nlpcraft:nlp", "a")
+ checkToken(toks(1), "wrapWrapLimit", "wrap before limit top 6")
+ checkToken(toks.last, "wrapAnyWord", "the any")
val wrap = toks(1).getPartTokens.asScala
@@ -147,6 +155,8 @@
require(wrapLimit.getPartTokens.size == 3, s"Parts count: ${wrapLimit.getPartTokens.size()}")
checkLimit(wrapLimit.getPartTokens.asScala.last)
+
+ checkWrapAnyWord(toks.last, "any")
case _ ⇒ throw new AssertionError(s"Unexpected request: ${ctx.getRequest.getNormalizedText}")
}
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/properties/NCTokensPropertiesSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/properties/NCTokensPropertiesSpec.scala
index 4bfd9b8..20bbb56 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/properties/NCTokensPropertiesSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/properties/NCTokensPropertiesSpec.scala
@@ -242,7 +242,7 @@
}
@NCTestEnvironment(model = classOf[NCTokenPropertiesModel7], startClient = true)
-class NCTokenPropertiesModel6Spe7 extends NCTestContext {
+class NCTokenPropertiesModel7Spec extends NCTestContext {
@Test
def test(): Unit = {
checkIntent("a b", "onAB")
@@ -281,7 +281,7 @@
}
@NCTestEnvironment(model = classOf[NCTokenPropertiesModel8], startClient = true)
-class NCTokenPropertiesModel6Spe8 extends NCTestContext {
+class NCTokenPropertiesModel8Spec extends NCTestContext {
@Test
def test(): Unit = {
checkIntent("a b", "onAB")