Part tokens detection logic fixes.

commit: 70fad8d21ea2091defe42242f3484be83132f12f [log] [tgz]
author: Sergey Kamov <skhdlemail@gmail.com> Fri Jun 25 09:50:38 2021 +0300
committer: Sergey Kamov <skhdlemail@gmail.com> Fri Jun 25 09:50:38 2021 +0300
tree: 3458291fec17a92b961ada327e8e664563154ba0
parent: 4f462e34789fd9d23bd69b4939008fc1ce9c3c2c [diff]
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
index f3122b3..bcf2c9c 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala

@@ -20,8 +20,8 @@
 import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank
 import org.apache.nlpcraft.common.nlp.{NCNlpSentence => NlpSentence, NCNlpSentenceNote => NlpNote, NCNlpSentenceToken => NlpToken}
 import org.apache.nlpcraft.common.{NCE, TOK_META_ALIASES_KEY}
+import org.apache.nlpcraft.model.NCVariant
 import org.apache.nlpcraft.model.impl.{NCTokenImpl, NCTokenLogger, NCVariantImpl}
-import org.apache.nlpcraft.model.{NCToken, NCVariant}
 
 import java.io.{Serializable => JSerializable}
 import java.util
@@ -37,18 +37,6 @@
     private final val IDXS: JSerializable = singletonList(IDX).asInstanceOf[JSerializable]
     private final val IDXS2: JSerializable = singletonList(singletonList(IDX)).asInstanceOf[JSerializable]
 
-    case class Key(id: String, from: Int, to: Int)
-
-    object Key {
-        def apply(m: util.HashMap[String, JSerializable]): Key = {
-            def get[T](name: String): T = m.get(name).asInstanceOf[T]
-
-            Key(get("id"), get("startcharindex"), get("endcharindex"))
-        }
-
-        def apply(t: NCToken): Key = Key(t.getId, t.getStartCharIndex, t.getEndCharIndex)
-    }
-
     /**
       *
       * @param t
@@ -77,17 +65,17 @@
       *
       * @param key
       * @param delNotes
-      * @param noteTypePred
+      * @param delNoteTypePred
       * @return
       */
     private def findDeletedToken(
-        key: Key,
+        key: NCTokenPartKey,
         delNotes: Map[NlpNote, Seq[NlpToken]],
-        noteTypePred: String => Boolean
+        delNoteTypePred: NlpNote => Boolean
     ): Option[NlpToken] =
         delNotes.to(LazyList).
             flatMap { case (delNote, delNoteToks) =>
-                if (noteTypePred(delNote.noteType)) {
+                if (delNoteTypePred(delNote)) {
                     val toks =
                         delNoteToks.
                             dropWhile(_.startCharIndex != key.from).
@@ -111,7 +99,7 @@
                                     case _ => // No-op.
                                 }
 
-                                artTok.add(delNote.clone(ps.toSeq :_*))
+                                artTok.add(delNote.clone(ps.toSeq: _*))
                             }
 
                             Some(artTok)
@@ -200,18 +188,18 @@
                 }
 
                 val toks = nlpSen.map(mkToken)
-                val keys2Toks = toks.map(t => Key(t) -> t).toMap
+                val keys2Toks = toks.map(t => NCTokenPartKey(t) -> t).toMap
 
                 def process(tok: NCTokenImpl, tokNlp: NlpToken): Unit = {
-                    val optList: Option[util.List[util.HashMap[String, JSerializable]]] =
+                    val optList: Option[util.List[NCTokenPartKey]] =
                         tokNlp.find(_.isUser) match {
                             case Some(u) => u.dataOpt("parts")
                             case None => None
                         }
 
                     optList match {
-                        case Some(list) =>
-                            val keys = list.asScala.map(Key(_))
+                        case Some(keysJava) =>
+                            val keys = keysJava.asScala
 
                             val parts = keys.map(key =>
                                 keys2Toks.get(key) match {
@@ -221,7 +209,11 @@
                                         val delNotes = nlpSen.getDeletedNotes
 
                                         // Tries to find with same key.
-                                        var nlpTokOpt = findDeletedToken(key, delNotes, _ == key.id)
+                                        var nlpTokOpt = findDeletedToken(
+                                            key,
+                                            delNotes,
+                                            (delNote: NlpNote) => key.similar(delNote)
+                                        )
 
                                         // If couldn't find nlp note, we can try to find any note on the same position.
                                         if (nlpTokOpt.isEmpty && key.id == "nlpcraft:nlp")
@@ -249,10 +241,10 @@
                                 }
                             )
 
-                            parts.zip(list.asScala).foreach { case (part, map) =>
-                                map.get(TOK_META_ALIASES_KEY) match {
+                            parts.zip(keys).foreach { case (part, key) =>
+                                key.aliases match {
                                     case null => // No-op.
-                                    case aliases => part.getMetadata.put(TOK_META_ALIASES_KEY, aliases.asInstanceOf[Object])
+                                    case aliases => part.getMetadata.put(TOK_META_ALIASES_KEY, aliases)
                                 }
                             }
 
@@ -267,7 +259,7 @@
                                         getOrElse(throw new NCE(s"Token not found for $tok"))
                                 )
 
-                            ok = ok && !toks.exists(t => t.getId != "nlpcraft:nlp" && keys.contains(Key(t)))
+                            ok = ok && !toks.exists(t => t.getId != "nlpcraft:nlp" && keys.contains(NCTokenPartKey(t)))
                         case None => // No-op.
                     }
                 }

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCTokenPartKey.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCTokenPartKey.scala
new file mode 100644
index 0000000..c89cae1
--- /dev/null
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCTokenPartKey.scala

@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.probe.mgrs
+
+import org.apache.nlpcraft.common.TOK_META_ALIASES_KEY
+import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSentenceToken}
+import org.apache.nlpcraft.model.NCToken
+import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.{NCSynonymChunkKind, _}
+
+import java.io.{Serializable => JSerializable}
+import java.util
+import java.util.{List => JList}
+import scala.compat.java8.OptionConverters.RichOptionalGeneric
+import scala.jdk.CollectionConverters.{MapHasAsJava, MapHasAsScala}
+import scala.language.implicitConversions
+import scala.collection.mutable
+
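+/**
+  * Factory methods building [[NCTokenPartKey]] instances from raw maps, model tokens and NLP sentence notes.
+  */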
+object NCTokenPartKey {
+    def apply(m: util.HashMap[String, JSerializable]): NCTokenPartKey = {
+        def get[T](name: String): T = m.get(name).asInstanceOf[T]
+
+        NCTokenPartKey(get("id"), get("startcharindex"), get("endcharindex"), get("data"))
+    }
+
+    def apply(part: NCToken, kind: NCSynonymChunkKind): NCTokenPartKey = {
+        val id = part.getId
+
+        val m: Map[String, Any] =
+            if (kind != TEXT)
+                id match {
+                    case "nlpcraft:relation" =>
+                        Map(
+                            "type" -> part.meta[String](s"$id:type"),
+                            "note" -> part.meta[String](s"$id:note")
+                        )
+                    case "nlpcraft:limit" =>
+                        Map(
+                            "limit" -> part.meta[Double](s"$id:limit"),
+                            "note" -> part.meta[String](s"$id:note")
+                        )
+                    case "nlpcraft:sort" =>
+                        val m = mutable.HashMap.empty[String, Any]
+
+                        def add(name: String): Unit =
+                            part.metaOpt[JList[String]](s"$id:$name").asScala match {
+                                case Some(list) => m += name -> list
+                                case None => // No-op.
+                            }
+
+                        add("subjnotes")
+                        add("bynotes")
+
+                        m.toMap
+                    case _ => Map.empty
+                }
+            else
+                Map.empty
+
+        val key = new NCTokenPartKey(
+            if (kind == TEXT) "nlpcraft:nlp" else id,
+            part.getStartCharIndex,
+            part.getEndCharIndex,
+            m.asJava
+        )
+
+        key.aliases = part.getMetadata.get(TOK_META_ALIASES_KEY)
+
+        key
+    }
+
+    def apply(t: NCToken): NCTokenPartKey =
+        new NCTokenPartKey(t.getId, t.getStartCharIndex, t.getEndCharIndex, Map.empty[String, Any].asJava)
+
+    def apply(note: NCNlpSentenceNote, sen: NCNlpSentence): NCTokenPartKey =
+        NCTokenPartKey(
+            note.noteType,
+            sen(note.tokenFrom).startCharIndex,
+            sen(note.tokenTo).endCharIndex,
+            Map.empty[String, Any].asJava
+        )
+
+    def apply(note: NCNlpSentenceNote, toks: Seq[NCNlpSentenceToken]): NCTokenPartKey = {
+        val sorted = toks.sortBy(_.index)
+
+        NCTokenPartKey(
+            note.noteType,
+            sorted.head.startCharIndex,
+            sorted.last.endCharIndex,
+            Map.empty[String, Any].asJava
+        )
+    }
+}
+
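+/**
+  * Key describing a part token by its ID, character range and optional extra data.
+  * @param id Element ID; `similar` compares it with the note type.
+  * @param from Start character index; must not be greater than `to`.
+  * @param to End character index.
+  * @param data Extra values; if non-empty, `similar` requires the note to hold an equal value for every entry.
+  */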
+case class NCTokenPartKey(id: String, from: Int, to: Int, data: util.Map[String, Any]) {
+    require(from <= to)
+
+    var aliases: AnyRef = _
+
+    private def in(i: Int): Boolean = i >= from && i <= to
+
+    def intersect(id: String, from: Int, to: Int): Boolean = id == this.id && (in(from) || in(to))
+
+    def similar(note: NCNlpSentenceNote): Boolean =
+        id == note.noteType &&
+        (
+            data.isEmpty ||
+            data.asScala.forall { case (k, v) => note.contains(k) && note.data(k) == v }
+        )
+}
\ No newline at end of file

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 1061ff8..a2deee8 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala

@@ -22,19 +22,18 @@
 import org.apache.nlpcraft.common.nlp.{NCNlpSentence => Sentence, NCNlpSentenceNote => NlpNote, NCNlpSentenceToken => NlpToken}
 import org.apache.nlpcraft.model._
 import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.NCIdlContent
-import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.{NCSynonymChunkKind, _}
+import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.NCSynonymChunkKind
 import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
 import org.apache.nlpcraft.probe.mgrs.nlp.impl.NCRequestImpl
 import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager
-import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeVariants, NCProbeSynonym => Synonym}
+import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeVariants, NCTokenPartKey, NCProbeSynonym => Synonym}
 
 import java.io.Serializable
-import java.util
 import java.util.{List => JList}
-import scala.collection.mutable.ArrayBuffer
 import scala.collection.mutable
-import scala.jdk.CollectionConverters.{ListHasAsScala, MapHasAsJava, MapHasAsScala, SeqHasAsJava}
+import scala.collection.mutable.ArrayBuffer
 import scala.collection.parallel.CollectionConverters._
+import scala.jdk.CollectionConverters.{ListHasAsScala, MapHasAsJava, MapHasAsScala, SeqHasAsJava}
 
 /**
   * Model elements enricher.
@@ -185,21 +184,8 @@
             case None => // No-op.
         }
 
-        if (parts.nonEmpty) {
-            val partsData: Seq[util.HashMap[String, Any]] =
-                parts.map { case (part, kind) =>
-                    val m = new util.HashMap[String, Any]()
-
-                    m.put("id", if (kind == TEXT) "nlpcraft:nlp" else part.getId)
-                    m.put("startcharindex", part.getStartCharIndex)
-                    m.put("endcharindex", part.getEndCharIndex)
-                    m.put(TOK_META_ALIASES_KEY, part.getMetadata.get(TOK_META_ALIASES_KEY))
-
-                    m
-                }
-
-            params += "parts" -> partsData.asJava
-        }
+        if (parts.nonEmpty)
+            params += "parts" -> parts.map { case (p, kind) => NCTokenPartKey(p, kind) }.asJava
 
         val idxs = toks.map(_.index).sorted
 

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index 339bb4c..74ead87 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala

@@ -21,14 +21,15 @@
 import org.apache.nlpcraft.common.nlp.NCNlpSentence.NoteLink
 import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank
 import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSentenceToken}
-import org.apache.nlpcraft.common.{NCE, NCService, U}
+import org.apache.nlpcraft.common.{NCE, NCService, U, _}
 import org.apache.nlpcraft.model.NCModel
+import org.apache.nlpcraft.probe.mgrs.NCTokenPartKey
 
 import java.io.{Serializable => JSerializable}
 import java.util
 import java.util.{List => JList}
 import scala.collection.mutable
-import scala.collection.parallel.CollectionConverters.ImmutableIterableIsParallelizable
+import scala.collection.parallel.CollectionConverters._
 import scala.jdk.CollectionConverters.{ListHasAsScala, SeqHasAsJava, SetHasAsJava}
 import scala.language.implicitConversions
 
@@ -42,23 +43,6 @@
     type CacheValue = Seq[Seq[NCNlpSentenceNote]]
     private val combCache = mutable.HashMap.empty[String, mutable.HashMap[CacheKey, CacheValue]]
 
-    case class PartKey(id: String, start: Int, end: Int) {
-        require(start <= end)
-
-        private def in(i: Int): Boolean = i >= start && i <= end
-        def intersect(id: String, start: Int, end: Int): Boolean = id == this.id && (in(start) || in(end))
-    }
-
-    object PartKey {
-        def apply(m: util.HashMap[String, JSerializable]): PartKey = {
-            def get[T](name: String): T = m.get(name).asInstanceOf[T]
-
-            PartKey(get("id"), get("startcharindex"), get("endcharindex"))
-        }
-
-        def apply(t: NCNlpSentenceNote, sen: NCNlpSentence): PartKey =
-            PartKey(t.noteType, sen(t.tokenFrom).startCharIndex, sen(t.tokenTo).endCharIndex)
-    }
 
     /**
       *
@@ -95,14 +79,14 @@
       *
       * @param notes
       */
-    private def getPartKeys(notes: NCNlpSentenceNote*): Seq[PartKey] =
+    private def getPartKeys(notes: NCNlpSentenceNote*): Seq[NCTokenPartKey] =
         notes.
             filter(_.isUser).
             flatMap(n => {
-                val optList: Option[JList[util.HashMap[String, JSerializable]]] = n.dataOpt("parts")
+                val optList: Option[JList[NCTokenPartKey]] = n.dataOpt("parts")
 
                 optList
-            }).flatMap(_.asScala).map(m => PartKey(m)).distinct
+            }).flatMap(_.asScala).distinct
 
     /**
       *
@@ -666,7 +650,7 @@
                 filter(getPartKeys(_).isEmpty).
                 flatMap(note => {
                     val noteWordsIdxs = note.wordIndexes.toSet
-                    val key = PartKey(note, sen)
+                    val key = NCTokenPartKey(note, sen)
 
                     val delCombOthers =
                         delCombs.filter(_ != note).flatMap(n => if (getPartKeys(n).contains(key)) Some(n) else None)

diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala
index 10a28e8..a83f697 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala

@@ -44,8 +44,7 @@
 
             val limNote = limitPart.getMetadata.get("nlpcraft:limit:note").asInstanceOf[String]
 
-            // TODO: wrapAnyWord? - check it (ticket NLPCRAFT-337)
-            require(limNote == "anyWord", s"Unexpected limit token note: '$limNote', token: $limitPart, meta: ${limitPart.getMetadata}")
+            require(limNote == "wrapAnyWord", s"Unexpected limit token note: '$limNote', token: $limitPart, meta: ${limitPart.getMetadata}")
 
             val limIdxs = limitPart.getMetadata.get("nlpcraft:limit:indexes").asInstanceOf[util.List[Integer]].asScala
commit	70fad8d21ea2091defe42242f3484be83132f12f	[log] [tgz]
author	Sergey Kamov <skhdlemail@gmail.com>	Fri Jun 25 09:50:38 2021 +0300
committer	Sergey Kamov <skhdlemail@gmail.com>	Fri Jun 25 09:50:38 2021 +0300
tree	3458291fec17a92b961ada327e8e664563154ba0
parent	4f462e34789fd9d23bd69b4939008fc1ce9c3c2c [diff]