Part tokens detection logic fixes.
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
index f3122b3..bcf2c9c 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
@@ -20,8 +20,8 @@
import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank
import org.apache.nlpcraft.common.nlp.{NCNlpSentence => NlpSentence, NCNlpSentenceNote => NlpNote, NCNlpSentenceToken => NlpToken}
import org.apache.nlpcraft.common.{NCE, TOK_META_ALIASES_KEY}
+import org.apache.nlpcraft.model.NCVariant
import org.apache.nlpcraft.model.impl.{NCTokenImpl, NCTokenLogger, NCVariantImpl}
-import org.apache.nlpcraft.model.{NCToken, NCVariant}
import java.io.{Serializable => JSerializable}
import java.util
@@ -37,18 +37,6 @@
private final val IDXS: JSerializable = singletonList(IDX).asInstanceOf[JSerializable]
private final val IDXS2: JSerializable = singletonList(singletonList(IDX)).asInstanceOf[JSerializable]
- case class Key(id: String, from: Int, to: Int)
-
- object Key {
- def apply(m: util.HashMap[String, JSerializable]): Key = {
- def get[T](name: String): T = m.get(name).asInstanceOf[T]
-
- Key(get("id"), get("startcharindex"), get("endcharindex"))
- }
-
- def apply(t: NCToken): Key = Key(t.getId, t.getStartCharIndex, t.getEndCharIndex)
- }
-
/**
*
* @param t
@@ -77,17 +65,17 @@
*
* @param key
* @param delNotes
- * @param noteTypePred
+ * @param delNoteTypePred
* @return
*/
private def findDeletedToken(
- key: Key,
+ key: NCTokenPartKey,
delNotes: Map[NlpNote, Seq[NlpToken]],
- noteTypePred: String => Boolean
+ delNoteTypePred: NlpNote => Boolean
): Option[NlpToken] =
delNotes.to(LazyList).
flatMap { case (delNote, delNoteToks) =>
- if (noteTypePred(delNote.noteType)) {
+ if (delNoteTypePred(delNote)) {
val toks =
delNoteToks.
dropWhile(_.startCharIndex != key.from).
@@ -111,7 +99,7 @@
case _ => // No-op.
}
- artTok.add(delNote.clone(ps.toSeq :_*))
+ artTok.add(delNote.clone(ps.toSeq: _*))
}
Some(artTok)
@@ -200,18 +188,18 @@
}
val toks = nlpSen.map(mkToken)
- val keys2Toks = toks.map(t => Key(t) -> t).toMap
+ val keys2Toks = toks.map(t => NCTokenPartKey(t) -> t).toMap
def process(tok: NCTokenImpl, tokNlp: NlpToken): Unit = {
- val optList: Option[util.List[util.HashMap[String, JSerializable]]] =
+ val optList: Option[util.List[NCTokenPartKey]] =
tokNlp.find(_.isUser) match {
case Some(u) => u.dataOpt("parts")
case None => None
}
optList match {
- case Some(list) =>
- val keys = list.asScala.map(Key(_))
+ case Some(keysJava) =>
+ val keys = keysJava.asScala
val parts = keys.map(key =>
keys2Toks.get(key) match {
@@ -221,7 +209,11 @@
val delNotes = nlpSen.getDeletedNotes
// Tries to find with same key.
- var nlpTokOpt = findDeletedToken(key, delNotes, _ == key.id)
+ var nlpTokOpt = findDeletedToken(
+ key,
+ delNotes,
+ (delNote: NlpNote) => key.similar(delNote)
+ )
// If couldn't find nlp note, we can try to find any note on the same position.
if (nlpTokOpt.isEmpty && key.id == "nlpcraft:nlp")
@@ -249,10 +241,10 @@
}
)
- parts.zip(list.asScala).foreach { case (part, map) =>
- map.get(TOK_META_ALIASES_KEY) match {
+ parts.zip(keys).foreach { case (part, key) =>
+ key.aliases match {
case null => // No-op.
- case aliases => part.getMetadata.put(TOK_META_ALIASES_KEY, aliases.asInstanceOf[Object])
+ case aliases => part.getMetadata.put(TOK_META_ALIASES_KEY, aliases)
}
}
@@ -267,7 +259,7 @@
getOrElse(throw new NCE(s"Token not found for $tok"))
)
- ok = ok && !toks.exists(t => t.getId != "nlpcraft:nlp" && keys.contains(Key(t)))
+ ok = ok && !toks.exists(t => t.getId != "nlpcraft:nlp" && keys.contains(NCTokenPartKey(t)))
case None => // No-op.
}
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCTokenPartKey.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCTokenPartKey.scala
new file mode 100644
index 0000000..c89cae1
--- /dev/null
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCTokenPartKey.scala
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.probe.mgrs
+
+import org.apache.nlpcraft.common.TOK_META_ALIASES_KEY
+import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSentenceToken}
+import org.apache.nlpcraft.model.NCToken
+import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.{NCSynonymChunkKind, _}
+
+import java.io.{Serializable => JSerializable}
+import java.util
+import java.util.{List => JList}
+import scala.compat.java8.OptionConverters.RichOptionalGeneric
+import scala.jdk.CollectionConverters.{MapHasAsJava, MapHasAsScala}
+import scala.language.implicitConversions
+import scala.collection.mutable
+
+/**
+ * Factory methods that build [[NCTokenPartKey]] instances from serialized maps, model tokens and NLP notes.
+ */
+object NCTokenPartKey {
+ def apply(m: util.HashMap[String, JSerializable]): NCTokenPartKey = { // Built from the "id", "startcharindex", "endcharindex" and "data" map entries.
+ def get[T](name: String): T = m.get(name).asInstanceOf[T] // Unchecked cast to the type expected at the call site.
+
+ NCTokenPartKey(get("id"), get("startcharindex"), get("endcharindex"), get("data"))
+ }
+ // Key for a synonym part: the chunk kind and the token ID decide which metadata values are copied into 'data'.
+ def apply(part: NCToken, kind: NCSynonymChunkKind): NCTokenPartKey = {
+ val id = part.getId
+
+ val m: Map[String, Any] =
+ if (kind != TEXT) // Only non-text chunks contribute token-specific data.
+ id match {
+ case "nlpcraft:relation" =>
+ Map(
+ "type" -> part.meta[String](s"$id:type"),
+ "note" -> part.meta[String](s"$id:note")
+ )
+ case "nlpcraft:limit" =>
+ Map(
+ "limit" -> part.meta[Double](s"$id:limit"),
+ "note" -> part.meta[String](s"$id:note")
+ )
+ case "nlpcraft:sort" =>
+ val m = mutable.HashMap.empty[String, Any]
+ // Each list is added only when the corresponding optional metadata is present.
+ def add(name: String): Unit =
+ part.metaOpt[JList[String]](s"$id:$name").asScala match {
+ case Some(list) => m += name -> list
+ case None => // No-op.
+ }
+
+ add("subjnotes")
+ add("bynotes")
+
+ m.toMap
+ case _ => Map.empty
+ }
+ else
+ Map.empty
+
+ val key = new NCTokenPartKey(
+ if (kind == TEXT) "nlpcraft:nlp" else id, // Text chunks are always keyed with the "nlpcraft:nlp" ID.
+ part.getStartCharIndex,
+ part.getEndCharIndex,
+ m.asJava
+ )
+ // 'aliases' is not a constructor parameter, so it is assigned after the key is created.
+ key.aliases = part.getMetadata.get(TOK_META_ALIASES_KEY)
+
+ key
+ }
+ // Key for a model token: its ID and character range, with empty data.
+ def apply(t: NCToken): NCTokenPartKey =
+ new NCTokenPartKey(t.getId, t.getStartCharIndex, t.getEndCharIndex, Map.empty[String, Any].asJava)
+ // Key for a note: its type and the character range from sentence token 'tokenFrom' to 'tokenTo', with empty data.
+ def apply(note: NCNlpSentenceNote, sen: NCNlpSentence): NCTokenPartKey =
+ NCTokenPartKey(
+ note.noteType,
+ sen(note.tokenFrom).startCharIndex,
+ sen(note.tokenTo).endCharIndex,
+ Map.empty[String, Any].asJava
+ )
+ // Key for a note over the given tokens: the range starts at the lowest-index token and ends at the highest-index one.
+ def apply(note: NCNlpSentenceNote, toks: Seq[NCNlpSentenceToken]): NCTokenPartKey = {
+ val sorted = toks.sortBy(_.index) // NOTE(review): 'head'/'last' below throw on empty 'toks' - confirm callers never pass one.
+
+ NCTokenPartKey(
+ note.noteType,
+ sorted.head.startCharIndex,
+ sorted.last.endCharIndex,
+ Map.empty[String, Any].asJava
+ )
+ }
+}
+
+/**
+ * Key identifying a token part by ID, inclusive character range and optional extra data values.
+ * @param id Token ID or note type this key refers to.
+ * @param from Start character index; must not be greater than `to`.
+ * @param to End character index.
+ * @param data Extra values a note must also carry to be `similar`; an empty map means "match on ID only".
+ */
+case class NCTokenPartKey(id: String, from: Int, to: Int, data: util.Map[String, Any]) {
+ require(from <= to)
+ // Declared in the class body rather than the constructor, so the generated equals/hashCode ignore it.
+ var aliases: AnyRef = _
+ // True when 'i' lies within the inclusive [from, to] range.
+ private def in(i: Int): Boolean = i >= from && i <= to
+ // NOTE(review): only the endpoints are tested, so a same-ID range strictly enclosing [from, to] yields false - confirm intended.
+ def intersect(id: String, from: Int, to: Int): Boolean = id == this.id && (in(from) || in(to))
+ // True when the note type equals 'id' and every 'data' entry passes 'note.contains' and equals 'note.data' for its key.
+ def similar(note: NCNlpSentenceNote): Boolean =
+ id == note.noteType &&
+ (
+ data.isEmpty ||
+ data.asScala.forall { case (k, v) => note.contains(k) && note.data(k) == v }
+ )
+}
\ No newline at end of file
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 1061ff8..a2deee8 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -22,19 +22,18 @@
import org.apache.nlpcraft.common.nlp.{NCNlpSentence => Sentence, NCNlpSentenceNote => NlpNote, NCNlpSentenceToken => NlpToken}
import org.apache.nlpcraft.model._
import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.NCIdlContent
-import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.{NCSynonymChunkKind, _}
+import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.NCSynonymChunkKind
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.impl.NCRequestImpl
import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager
-import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeVariants, NCProbeSynonym => Synonym}
+import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeVariants, NCTokenPartKey, NCProbeSynonym => Synonym}
import java.io.Serializable
-import java.util
import java.util.{List => JList}
-import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable
-import scala.jdk.CollectionConverters.{ListHasAsScala, MapHasAsJava, MapHasAsScala, SeqHasAsJava}
+import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.CollectionConverters._
+import scala.jdk.CollectionConverters.{ListHasAsScala, MapHasAsJava, MapHasAsScala, SeqHasAsJava}
/**
* Model elements enricher.
@@ -185,21 +184,8 @@
case None => // No-op.
}
- if (parts.nonEmpty) {
- val partsData: Seq[util.HashMap[String, Any]] =
- parts.map { case (part, kind) =>
- val m = new util.HashMap[String, Any]()
-
- m.put("id", if (kind == TEXT) "nlpcraft:nlp" else part.getId)
- m.put("startcharindex", part.getStartCharIndex)
- m.put("endcharindex", part.getEndCharIndex)
- m.put(TOK_META_ALIASES_KEY, part.getMetadata.get(TOK_META_ALIASES_KEY))
-
- m
- }
-
- params += "parts" -> partsData.asJava
- }
+ if (parts.nonEmpty)
+ params += "parts" -> parts.map { case (p, kind) => NCTokenPartKey(p, kind) }.asJava
val idxs = toks.map(_.index).sorted
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index 339bb4c..74ead87 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -21,14 +21,15 @@
import org.apache.nlpcraft.common.nlp.NCNlpSentence.NoteLink
import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank
import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSentenceToken}
-import org.apache.nlpcraft.common.{NCE, NCService, U}
+import org.apache.nlpcraft.common.{NCE, NCService, U, _}
import org.apache.nlpcraft.model.NCModel
+import org.apache.nlpcraft.probe.mgrs.NCTokenPartKey
import java.io.{Serializable => JSerializable}
import java.util
import java.util.{List => JList}
import scala.collection.mutable
-import scala.collection.parallel.CollectionConverters.ImmutableIterableIsParallelizable
+import scala.collection.parallel.CollectionConverters._
import scala.jdk.CollectionConverters.{ListHasAsScala, SeqHasAsJava, SetHasAsJava}
import scala.language.implicitConversions
@@ -42,23 +43,6 @@
type CacheValue = Seq[Seq[NCNlpSentenceNote]]
private val combCache = mutable.HashMap.empty[String, mutable.HashMap[CacheKey, CacheValue]]
- case class PartKey(id: String, start: Int, end: Int) {
- require(start <= end)
-
- private def in(i: Int): Boolean = i >= start && i <= end
- def intersect(id: String, start: Int, end: Int): Boolean = id == this.id && (in(start) || in(end))
- }
-
- object PartKey {
- def apply(m: util.HashMap[String, JSerializable]): PartKey = {
- def get[T](name: String): T = m.get(name).asInstanceOf[T]
-
- PartKey(get("id"), get("startcharindex"), get("endcharindex"))
- }
-
- def apply(t: NCNlpSentenceNote, sen: NCNlpSentence): PartKey =
- PartKey(t.noteType, sen(t.tokenFrom).startCharIndex, sen(t.tokenTo).endCharIndex)
- }
/**
*
@@ -95,14 +79,14 @@
*
* @param notes
*/
- private def getPartKeys(notes: NCNlpSentenceNote*): Seq[PartKey] =
+ private def getPartKeys(notes: NCNlpSentenceNote*): Seq[NCTokenPartKey] =
notes.
filter(_.isUser).
flatMap(n => {
- val optList: Option[JList[util.HashMap[String, JSerializable]]] = n.dataOpt("parts")
+ val optList: Option[JList[NCTokenPartKey]] = n.dataOpt("parts")
optList
- }).flatMap(_.asScala).map(m => PartKey(m)).distinct
+ }).flatMap(_.asScala).distinct
/**
*
@@ -666,7 +650,7 @@
filter(getPartKeys(_).isEmpty).
flatMap(note => {
val noteWordsIdxs = note.wordIndexes.toSet
- val key = PartKey(note, sen)
+ val key = NCTokenPartKey(note, sen)
val delCombOthers =
delCombs.filter(_ != note).flatMap(n => if (getPartKeys(n).contains(key)) Some(n) else None)
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala
index 10a28e8..a83f697 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala
@@ -44,8 +44,7 @@
val limNote = limitPart.getMetadata.get("nlpcraft:limit:note").asInstanceOf[String]
- // TODO: wrapAnyWord? - check it (ticket NLPCRAFT-337)
- require(limNote == "anyWord", s"Unexpected limit token note: '$limNote', token: $limitPart, meta: ${limitPart.getMetadata}")
+ require(limNote == "wrapAnyWord", s"Unexpected limit token note: '$limNote', token: $limitPart, meta: ${limitPart.getMetadata}")
val limIdxs = limitPart.getMetadata.get("nlpcraft:limit:indexes").asInstanceOf[util.List[Integer]].asScala