Merge branch 'master' into NLPCRAFT-70_NEW
diff --git a/nlpcraft/pom.xml b/nlpcraft/pom.xml
index 42726f7..adfbf52 100644
--- a/nlpcraft/pom.xml
+++ b/nlpcraft/pom.xml
@@ -232,6 +232,11 @@
<groupId>org.jline</groupId>
<artifactId>jline</artifactId>
</dependency>
+ <!-- TODO: add this library license description. -->
+ <dependency>
+ <groupId>org.jibx</groupId>
+ <artifactId>jibx-tools</artifactId>
+ </dependency>
<!-- Test dependencies. -->
<dependency>
diff --git a/nlpcraft/src/main/resources/log4j2.xml b/nlpcraft/src/main/resources/log4j2.xml
index d9a627b..44590c3 100644
--- a/nlpcraft/src/main/resources/log4j2.xml
+++ b/nlpcraft/src/main/resources/log4j2.xml
@@ -36,7 +36,7 @@
<AppenderRef ref="stdout"/>
<AppenderRef ref="stderr"/>
</Root>
- <Logger name="org.apache.nlpcraft" level="INFO" additivity="false">
+ <Logger name="org.apache.nlpcraft" level="${env:NLPCRAFT_LOG_LEVEL:-INFO}" additivity="false">
<AppenderRef ref="stdout"/>
<AppenderRef ref="stderr"/>
</Logger>
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
index f508745..7b5d058 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
@@ -18,6 +18,7 @@
package org.apache.nlpcraft.common.nlp
import org.apache.nlpcraft.common._
+import org.apache.nlpcraft.server.mdo.NCCtxWordCategoriesConfigMdo
import java.io.{Serializable => JSerializable}
import java.util.{Collections, List => JList}
@@ -40,6 +41,7 @@
* @param srvReqId Server request ID.
* @param text Normalized text.
* @param enabledBuiltInToks Enabled built-in tokens.
+ * @param ctxWordConfig Machine learning configuration. Optional.
* @param tokens Initial buffer.
* @param firstProbePhase Processing phase flag.
* @param deletedNotes Deleted overridden notes with their tokens.
@@ -50,6 +52,8 @@
val srvReqId: String,
val text: String,
val enabledBuiltInToks: Set[String],
+ val ctxWordConfig: Option[NCCtxWordCategoriesConfigMdo] = None,
+ var ctxWordCategories: Map[/** Token index*/Int, Map[/** Elements ID*/String, /** Confidence*/Double]] = Map.empty,
override val tokens: mutable.ArrayBuffer[NCNlpSentenceToken] = new mutable.ArrayBuffer[NCNlpSentenceToken](32),
var firstProbePhase: Boolean = true,
private val deletedNotes: mutable.HashMap[NCNlpSentenceNote, Seq[NCNlpSentenceToken]] = mutable.HashMap.empty,
@@ -67,6 +71,7 @@
srvReqId = srvReqId,
text = text,
enabledBuiltInToks = enabledBuiltInToks,
+ ctxWordConfig = ctxWordConfig,
tokens = tokens.map(_.clone()),
deletedNotes = deletedNotes.map(p => p._1.clone() -> p._2.map(_.clone())),
initNlpNotes = initNlpNotes,
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/pos/NCPennTreebank.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/pos/NCPennTreebank.scala
index a61c63a..0c6e0de 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/pos/NCPennTreebank.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/pos/NCPennTreebank.scala
@@ -68,7 +68,9 @@
final val SYNTH_POS_DESC = "Synthetic tag"
// Useful POS tags sets.
- final val NOUNS_POS = Seq("NN", "NNS", "NNP", "NNPS")
+ final val NOUNS_POS_PLURALS = Seq("NNS", "NNPS")
+ final val NOUNS_POS_SINGULAR = Seq("NN", "NNP")
+ final val NOUNS_POS = NOUNS_POS_PLURALS ++ NOUNS_POS_SINGULAR
final val VERBS_POS = Seq("VB", "VBD", "VBG", "VBN", "VBP", "VBZ")
final val WHS_POS = Seq("WDT", "WP", "WP$", "WRB")
final val JJS_POS = Seq("JJ", "JJR", "JJS")
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/package.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/package.scala
index da194cc..e2b4bf1 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/package.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/package.scala
@@ -37,8 +37,8 @@
final val U = NCUtils
// Internal deep debug flag (more verbose tracing).
- final val DEEP_DEBUG = false
-
+ final val DEEP_DEBUG = U.isSysEnvSet("NLPCRAFT_DEEP_DEBUG")
+
// Model and token **internal** metadata keys.
final val TOK_META_ALIASES_KEY = "__NLPCRAFT_TOK_META_ALIASES"
final val MDL_META_MODEL_CLASS_KEY = "__NLPCRAFT_MDL_CLASS_NAME"
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCElement.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCElement.java
index 9f5872a..0e119c0 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCElement.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCElement.java
@@ -382,4 +382,9 @@
default Optional<Boolean> isSparse() {
return Optional.empty();
}
+
+ // TODO:
+ default Optional<Double> getCategoryConfidence() {
+ return Optional.empty();
+ }
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelFileAdapter.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelFileAdapter.java
index c313bf7..469858e 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelFileAdapter.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelFileAdapter.java
@@ -349,15 +349,20 @@
@Override
public Optional<Boolean> isPermutateSynonyms() {
- return nvl(js.isPermutateSynonyms(), proxy.isPermutateSynonyms());
+ return nvlOpt(js.isPermutateSynonyms(), proxy.isPermutateSynonyms());
}
@Override
public Optional<Boolean> isSparse() {
- return nvl(js.isSparse(), proxy.isSparse());
+ return nvlOpt(js.isSparse(), proxy.isSparse());
}
- private<T> Optional<T> nvl(T t, T dflt) {
+ @Override
+ public Optional<Double> getCategoryConfidence() {
+ return Optional.ofNullable(js.getCategoryConfidence());
+ }
+
+ private<T> Optional<T> nvlOpt(T t, T dflt) {
return Optional.of(t != null ? t : dflt);
}
};
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/NCTokenLogger.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/NCTokenLogger.scala
index 2bbc72a..2a44f39 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/NCTokenLogger.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/NCTokenLogger.scala
@@ -17,10 +17,9 @@
package org.apache.nlpcraft.model.impl
-import java.text.SimpleDateFormat
+import java.text.{DecimalFormat, SimpleDateFormat}
import java.util
import java.util.{List => JList}
-
import com.typesafe.scalalogging.LazyLogging
import org.apache.nlpcraft.common._
import org.apache.nlpcraft.common.ascii._
@@ -38,6 +37,8 @@
//noinspection DuplicatedCode
object NCTokenLogger extends LazyLogging {
case class NoteMetadata(noteType: String, filtered: Seq[String], isFull: Boolean)
+
+ private final val FMT_NUM = new DecimalFormat("#0.00000")
// Order and sorting of notes for ASCII output.
private final val NOTE_TYPES = Seq[String](
@@ -617,6 +618,11 @@
if (parts.nonEmpty)
s = s"$s, parts=[$parts]"
+ t.meta(s"${t.getId}:confidence").asInstanceOf[java.lang.Double] match {
+ case null => // No-op.
+ case conf => s = s"$s, confidence=${FMT_NUM.format(conf)}"
+ }
+
s
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/json/NCElementJson.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/json/NCElementJson.java
index addca45..8217a6a 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/json/NCElementJson.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/json/NCElementJson.java
@@ -36,6 +36,8 @@
private Boolean isPermutateSynonyms;
// Can be null.
private Boolean isSparse;
+ // Can be null.
+ private Double categoryConfidence;
public String getParentId() {
return parentId;
@@ -97,4 +99,10 @@
public void setSparse(Boolean sparse) {
isSparse = sparse;
}
+ public Double getCategoryConfidence() {
+ return categoryConfidence;
+ }
+ public void setCategoryConfidence(Double categoryConfidence) {
+ this.categoryConfidence = categoryConfidence;
+ }
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/conn/NCConnectionManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/conn/NCConnectionManager.scala
index d2a4619..1c0add2 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/conn/NCConnectionManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/conn/NCConnectionManager.scala
@@ -32,9 +32,10 @@
import java.net.{InetAddress, NetworkInterface}
import java.util
import java.util.concurrent.CountDownLatch
-import java.util.{Properties, TimeZone}
+import java.util.{Collections, Properties, TimeZone}
import scala.collection.mutable
-import scala.jdk.CollectionConverters.{SetHasAsJava, SetHasAsScala}
+import scala.compat.java8.OptionConverters.RichOptionalGeneric
+import scala.jdk.CollectionConverters.{ListHasAsScala, MapHasAsJava, SeqHasAsJava, SetHasAsJava, SetHasAsScala}
/**
* Probe down/up link connection manager.
@@ -214,6 +215,51 @@
NCModelManager.getAllModels().map(wrapper => {
val mdl = wrapper.model
+ val (
+ singleValues,
+ corpus,
+ categoriesElements
+ ): (
+ java.util.Map[String, java.util.Map[String, java.util.Set[String]]],
+ java.util.Set[String],
+ java.util.Map[String, java.lang.Double]
+ ) = {
+ val ctxCatElems = mdl.getElements.asScala.flatMap(e =>
+ e.getCategoryConfidence.asScala match {
+ case Some(v) => Some(e.getId -> v)
+ case None => None
+ }
+ ).toMap
+
+ if (ctxCatElems.isEmpty)
+ (Collections.emptyMap(), Collections.emptySet(), Collections.emptyMap())
+ else {
+ val values =
+ mdl.getElements.
+ asScala.
+ filter(p => ctxCatElems.contains(p.getId)).
+ map(e =>
+ e.getId ->
+ e.getValues.asScala.map(p => p.getName -> {
+ val set: util.Set[String] =
+ new util.HashSet(
+ p.getSynonyms.asScala.filter(p => !p.contains(" ")).asJava
+ )
+
+ set.add(p.getName)
+
+ set
+ }).toMap.asJava
+ ).toMap
+
+ (
+ values.asJava,
+ wrapper.samples.flatMap(_._2.flatMap(p => p)).asJava,
+ ctxCatElems.asJava
+ )
+ }
+ }
+
// Model already validated.
// util.HashSet created to avoid scala collections serialization error.
@@ -223,7 +269,10 @@
mdl.getName,
mdl.getVersion,
new util.HashSet[String](mdl.getEnabledBuiltInTokens),
- new util.HashSet[String](mdl.getElements.asScala.map(_.getId).asJava)
+ new util.HashSet[String](mdl.getElements.asScala.map(_.getId).asJava),
+ singleValues,
+ corpus,
+ categoriesElements
)
})
), cryptoKey)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala
index 8d89477..bc7c5ed 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala
@@ -92,6 +92,9 @@
private final val SEPARATORS = Seq('?', ',', '.', '-', '!')
private final val SUSP_SYNS_CHARS = Seq("?", "*", "+")
+ private final val MAX_CTXWORD_VALS_CNT = 10000
+ private final val MAX_CTXWORD_SAMPLES_CNT = 1000
+
@volatile private var data: mutable.ArrayBuffer[NCProbeModel] = _
@volatile private var mdlFactory: NCModelFactory = _
@@ -445,6 +448,44 @@
s"max=$maxCnt" +
s"]")
+ // Validates context words parameters.
+ val elems = mdl.getElements.asScala
+
+ val ctxCatElems = elems.flatMap(e =>
+ e.getCategoryConfidence.asScala match {
+ case Some(v) => Some(e.getId -> v)
+ case None => None
+ }
+ ).toMap
+
+ if (ctxCatElems.nonEmpty) {
+ val ids = ctxCatElems.filter { case (_, conf) => conf < 0 || conf > 1 }.keys
+
+ if (ids.nonEmpty)
+ // TODO:
+ throw new NCE(s"Context word confidences are out of range (0..1) for elements : ${ids.mkString(", ")}")
+
+ val cnt =
+ elems.map(e =>
+ if (e.getValues != null)
+ e.getValues.asScala.map(
+ p => if (p.getSynonyms != null) p.getSynonyms.asScala.count(!_.contains(" ")) else 0
+ ).sum + 1 // 1 for value name.
+ else
+ 0
+ ).sum
+
+ if (cnt > MAX_CTXWORD_VALS_CNT)
+ // TODO: do we need to print the recommended value?
+ logger.warn(
+ s"Too many values synonyms detected for context words elements [" +
+ s"mdlId=$mdlId, " +
+ s"cnt=$cnt," +
+ s"recommendedMax=$MAX_CTXWORD_VALS_CNT" +
+ s"]"
+ )
+ }
+
// Discard value loaders.
for (elm <- mdl.getElements.asScala)
elm.getValueLoader.ifPresent(_.onDiscard())
@@ -530,11 +571,23 @@
else
logger.warn(s"Model has no intent: $mdlId")
- def toMap(set: Set[SynonymHolder]): Map[String, Seq[NCProbeSynonym]] =
- set.groupBy(_.elmId).map(p => p._1 -> p._2.map(_.syn).toSeq.sorted.reverse)
+ val samples = scanSamples(mdl)
+
+ if (ctxCatElems.nonEmpty && samples.size > MAX_CTXWORD_SAMPLES_CNT)
+ // TODO: do we need to print the recommended value?
+ logger.warn(
+ s"Too many samples detected for context words elements [" +
+ s"mdlId=$mdlId, " +
+ s"cnt=${samples.size}," +
+ s"recommended=$MAX_CTXWORD_SAMPLES_CNT" +
+ s"]"
+ )
val simple = idl(syns.toSet, idl = false)
+ def toMap(set: Set[SynonymHolder]): Map[String, Seq[NCProbeSynonym]] =
+ set.groupBy(_.elmId).map(p => p._1 -> p._2.map(_.syn).toSeq.sorted.reverse)
+
NCProbeModel(
model = mdl,
solver = solver,
@@ -554,7 +607,7 @@
exclStopWordsStems = exclStopWords,
suspWordsStems = suspWords,
elements = mdl.getElements.asScala.map(elm => (elm.getId, elm)).toMap,
- samples = scanSamples(mdl)
+ samples = samples
)
}
@@ -1692,7 +1745,7 @@
s"origin=${mdl.getOrigin}, " +
s"intentIds=${unusedIntents.map(_.id).mkString("(", ", ", ")")}]"
)
-
+
intents.toSet
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 1f81711..67bacf0 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -29,6 +29,7 @@
import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeVariants, NCTokenPartKey, NCProbeSynonym => Synonym}
import java.io.Serializable
+import java.lang
import java.util.{List => JList}
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
@@ -447,6 +448,18 @@
startScopedSpan(
"enrich", parent, "srvReqId" -> ns.srvReqId, "mdlId" -> mdl.model.getId, "txt" -> ns.text
) { span =>
+ if (ns.firstProbePhase)
+ for ((tokIdx, map) <- ns.ctxWordCategories; (elemId, conf) <- map)
+ mark(
+ ns = ns,
+ elem =
+ mdl.elements.find(_._1 == elemId).
+ getOrElse(throw new NCE(s"Element not found: $elemId"))._2,
+ toks = Seq(ns.tokens(tokIdx)),
+ direct = true,
+ metaOpt = Some(Map("confidence" -> lang.Double.valueOf(conf)))
+ )
+
val req = NCRequestImpl(senMeta, ns.srvReqId)
val combToks = combos(ns.toSeq)
lazy val ch = mkComplexes(mdl, ns)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/mdo/NCProbeModelMdo.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/mdo/NCProbeModelMdo.scala
index 1b6001b..a727912 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/mdo/NCProbeModelMdo.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/mdo/NCProbeModelMdo.scala
@@ -19,6 +19,14 @@
import org.apache.nlpcraft.server.mdo.impl._
+@NCMdoEntity(sql = false)
+case class NCCtxWordCategoriesConfigMdo(
+ @NCMdoField probeId: String,
+ @NCMdoField modelId: String,
+ @NCMdoField singleValues: Map[String /*Element ID*/, Map[/*Value*/String, /*Synonym*/Set[String]]],
+ @NCMdoField corpus: Set[String],
+ @NCMdoField elements: Map[String /*Element ID*/, /*Confidence*/ Double]
+)
/**
* Probe model MDO.
*/
@@ -28,7 +36,8 @@
@NCMdoField name: String,
@NCMdoField version: String,
@NCMdoField enabledBuiltInTokens: Set[String],
- @NCMdoField elementIds: Set[String]
+ @NCMdoField elementIds: Set[String],
+ @NCMdoField ctxWordConfig: Option[NCCtxWordCategoriesConfigMdo]
) extends NCAnnotatedMdo[NCProbeModelMdo] {
override def hashCode(): Int = s"$id$name".hashCode()
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala
index 636b263..32e8909 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala
@@ -26,9 +26,11 @@
import org.apache.nlpcraft.common.{NCService, _}
import org.apache.nlpcraft.server.ignite.NCIgniteHelpers._
import org.apache.nlpcraft.server.ignite.NCIgniteInstance
+import org.apache.nlpcraft.server.mdo.NCCtxWordCategoriesConfigMdo
import org.apache.nlpcraft.server.nlp.core.{NCNlpNerEnricher, NCNlpServerManager}
import org.apache.nlpcraft.server.nlp.enrichers.basenlp.NCBaseNlpEnricher
import org.apache.nlpcraft.server.nlp.enrichers.coordinate.NCCoordinatesEnricher
+import org.apache.nlpcraft.server.nlp.enrichers.ctxword.NCContextWordCategoriesEnricher
import org.apache.nlpcraft.server.nlp.enrichers.date.NCDateEnricher
import org.apache.nlpcraft.server.nlp.enrichers.geo.NCGeoEnricher
import org.apache.nlpcraft.server.nlp.enrichers.numeric.NCNumericEnricher
@@ -90,6 +92,7 @@
* @param srvReqId Server request ID.
* @param normTxt Normalized text.
* @param enabledBuiltInToks Enabled built-in tokens.
+ * @param ctxWordCatConf Machine learning configuration.
* @param parent Optional parent span.
* @return
*/
@@ -97,9 +100,11 @@
srvReqId: String,
normTxt: String,
enabledBuiltInToks: Set[String],
- parent: Span = null): NCNlpSentence =
+ ctxWordCatConf: Option[NCCtxWordCategoriesConfigMdo],
+ parent: Span = null
+ ): NCNlpSentence =
startScopedSpan("process", parent, "srvReqId" -> srvReqId, "txt" -> normTxt) { span =>
- val s = new NCNlpSentence(srvReqId, normTxt, enabledBuiltInToks)
+ val s = new NCNlpSentence(srvReqId, normTxt, enabledBuiltInToks, ctxWordCatConf)
// Server-side enrichment pipeline.
// NOTE: order of enrichers is IMPORTANT.
@@ -121,6 +126,8 @@
NCCoordinatesEnricher.enrich(s, span)
}
+ NCContextWordCategoriesEnricher.enrich(s, span)
+
ner(s, enabledBuiltInToks)
prepareAsciiTable(s).info(logger, Some(s"Server-side enrichment (built-in tokens only) for: '$normTxt'"))
@@ -134,6 +141,7 @@
* @param srvReqId Server request ID.
* @param txt Input text.
* @param enabledBuiltInToks Set of enabled built-in token IDs.
+ * @param ctxWordCatConf Machine learning configuration.
* @param parent Optional parent span.
*/
@throws[NCE]
@@ -141,29 +149,34 @@
srvReqId: String,
txt: String,
enabledBuiltInToks: Set[String],
- parent: Span = null): NCNlpSentence = {
+ ctxWordCatConf: Option[NCCtxWordCategoriesConfigMdo],
+ parent: Span = null
+ ): NCNlpSentence = {
startScopedSpan("enrichPipeline", parent, "srvReqId" -> srvReqId, "txt" -> txt) { span =>
val normTxt = NCPreProcessManager.normalize(txt, spellCheck = true, span)
if (normTxt != txt)
logger.info(s"Sentence normalized: $normTxt")
- val normEnabledBuiltInToks = enabledBuiltInToks.map(_.toLowerCase)
+ def execute(): NCNlpSentence = process(srvReqId, normTxt, enabledBuiltInToks, ctxWordCatConf, span)
- catching(wrapIE) {
- cache(normTxt) match {
- case Some(h) =>
- if (h.enabledBuiltInTokens == normEnabledBuiltInToks) {
- prepareAsciiTable(h.sentence).info(logger, Some(s"Sentence enriched (from cache): '$normTxt'"))
+ if (U.isSysEnvSet("NLPCRAFT_DISABLE_SENTENCE_CACHE"))
+ execute()
+ else
+ catching(wrapIE) {
+ cache(normTxt) match {
+ case Some(h) =>
+ if (h.enabledBuiltInTokens == enabledBuiltInToks.map(_.toLowerCase)) {
+ prepareAsciiTable(h.sentence).info(logger, Some(s"Sentence enriched (from cache): '$normTxt'"))
- h.sentence
- }
- else
- process(srvReqId, normTxt, enabledBuiltInToks, span)
- case None =>
- process(srvReqId, normTxt, enabledBuiltInToks, span)
+ h.sentence
+ }
+ else
+ execute()
+ case None =>
+ execute()
+ }
}
- }
}
}
@@ -273,7 +286,8 @@
() => NCDateEnricher.start(span),
() => NCNumericEnricher.start(span),
() => NCGeoEnricher.start(span),
- () => NCCoordinatesEnricher.start(span)
+ () => NCCoordinatesEnricher.start(span),
+ () => NCContextWordCategoriesEnricher.start(span)
)
}
@@ -291,6 +305,7 @@
ackStopping()
if (Config.isBuiltInEnrichers) {
+ NCContextWordCategoriesEnricher.stop(span)
NCCoordinatesEnricher.stop(span)
NCGeoEnricher.stop(span)
NCNumericEnricher.stop(span)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordCategoriesEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordCategoriesEnricher.scala
new file mode 100644
index 0000000..4a827a2
--- /dev/null
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordCategoriesEnricher.scala
@@ -0,0 +1,626 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.server.nlp.enrichers.ctxword
+
+import io.opencensus.trace.Span
+import org.apache.nlpcraft.common.ascii.NCAsciiTable
+import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager.stem
+import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank._
+import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceToken}
+import org.apache.nlpcraft.common.{DEEP_DEBUG, NCE, NCService}
+import org.apache.nlpcraft.server.mdo.NCCtxWordCategoriesConfigMdo
+import org.apache.nlpcraft.server.nlp.core.{NCNlpParser, NCNlpServerManager, NCNlpWord}
+import org.apache.nlpcraft.server.nlp.enrichers.NCServerEnricher
+import org.apache.nlpcraft.server.sugsyn.{NCSuggestSynonymManager, NCSuggestionRequest => Request, NCWordSuggestion => Suggestion}
+import org.jibx.schema.codegen.extend.DefaultNameConverter
+
+import java.text.DecimalFormat
+import scala.collection.mutable
+import scala.collection.mutable.ArrayBuffer
+import scala.concurrent.Await
+import scala.concurrent.duration.Duration
+
+/**
+ * ContextWord enricher.
+ * When starting the server, set the following environment variables for deep debugging.
+ * - NLPCRAFT_LOG_LEVEL=TRACE
+ * - NLPCRAFT_DEEP_DEBUG=true
+ * - NLPCRAFT_DISABLE_SENTENCE_CACHE=true
+ */
+object NCContextWordCategoriesEnricher extends NCServerEnricher {
+ private final val MAX_CTXWORD_SCORE = 2
+ private final val INCL_MAX_CONFIDENCE = 1.0
+
+ private final val CONVERTER = new DefaultNameConverter
+ private final val FMT = new DecimalFormat("#0.00000")
+
+ private case class Reason(word: String, suggConf: Double, valOrCorpConf: Double) {
+ override def toString: String =
+ s"Word: $word, confidences: suggestion=${FMT.format(suggConf)}, value or corpus=${FMT.format(valOrCorpConf)}"
+ }
+
+ private case class Confidence(value: Double, reason: Option[Reason] = None) {
+ override def toString: String =
+ s"${FMT.format(value)}(${if (reason.isDefined) s"via:'${reason.get}'" else "direct"})}"
+ }
+
+ private case class ModelProbeKey(probeId: String, modelId: String)
+
+ private case class ElementConfidence(elementId: String, confidence: Confidence) {
+ override def toString: String = s"Element [id=$elementId, confidence=$confidence]]"
+ }
+
+ // Maps: Key is word, values are all element IDs.
+ private case class ValuesHolder(normal: Map[String, Set[String]], stems: Map[String, Set[String]]) {
+ private def map2Str(m: Map[String, Set[String]]): String =
+ m.toSeq.flatMap { case (v, elems) =>
+ elems.toSeq.map(_ -> v) }.groupBy { case (v, _) => v }.map { case (v, seq) => v -> toStr(seq.map(_._2))
+ }.mkString(", ")
+
+ override def toString: String = s"Values [normal=${map2Str(normal)}, stems=${map2Str(stems)}]"
+ }
+
+ // Maps: Key is elementID, values are all values synonyms for this element.
+ private case class ElementData(normals: Map[String, Double], stems: Map[String, Double], lemmas: Map[String, Double]) {
+ def get(norm: String, stem: String, lemma: String): Option[Double] =
+ normals.get(norm) match {
+ case Some(v) => Some(v)
+ case None =>
+ stems.get(stem) match {
+ case Some(v) => Some(v)
+ case None => lemmas.get(lemma)
+ }
+ }
+ }
+
+ // Service responsible for all confidence calculations.
+ private object ConfMath {
+ /**
+ * Squeezes the word's confidence values list (result of corpus processing) into a single value.
+ *
+ * @param confs Word's confidence values.
+ * @return Calculated single value. `None` means that this word shouldn't be taken into account for the element.
+ */
+ def squeeze(confs: Seq[Double]): Option[Double] = {
+ // Drops the word if there is not enough data.
+ // For one element we have only a few samples; each word should have been suggested a few times.
+ if (confs.length < 3)
+ None
+ else {
+ // Takes 50% of most important (or first 2 at least) and calculates average value.
+ val n = Math.max((confs.length * 0.5).intValue(), 2)
+
+ val maxN = confs.sortBy(-_).take(n)
+
+ Some(maxN.sum / maxN.length)
+ }
+ }
+
+ /**
+ * Calculates confidence values based on suggested confidence for given word and corpus confidence.
+ *
+ * @param suggConf Suggestion confidence for noun of given sentence.
+ * @param corpusConf Corpus confidence which was found via the suggestion's co-reference.
+ */
+ def calculate(suggConf: Double, corpusConf: Double): Double =
+ // Corpus data is more important. Empirical factors configured.
+ calcWeightedGeoMean(Map(suggConf -> 1, corpusConf -> 2))
+
+ /**
+ * Calculates weighted geometrical mean value.
+ *
+ * @param vals2Weights Values with their weights.
+ */
+ private def calcWeightedGeoMean(vals2Weights: Map[Double, Double]): Double =
+ Math.pow(vals2Weights.map { case (v, weight) => Math.pow(v, weight) }.product, 1.0 / vals2Weights.values.sum)
+ }
+
+ @volatile private var valuesStems: mutable.HashMap[ModelProbeKey, ValuesHolder] = _
+ @volatile private var elemsCorpuses: mutable.HashMap[ModelProbeKey, Map[String, ElementData]] = _
+ @volatile private var parser: NCNlpParser = _
+
+ override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ =>
+ ackStarting()
+
+ valuesStems = mutable.HashMap.empty
+ elemsCorpuses = mutable.HashMap.empty
+ parser = NCNlpServerManager.getParser
+
+ ackStarted()
+ }
+
+ override def stop(parent: Span = null): Unit =
+ startScopedSpan("stop", parent) { _ =>
+ ackStopping()
+
+ parser = null
+ elemsCorpuses = null
+ valuesStems = null
+
+ ackStopped()
+ }
+
+ /**
+ *
+ * @param seq
+ * @return
+ */
+ private def toStr(seq: Seq[String]): String = seq.mkString("{ ", ", ", " }")
+
+ /**
+ *
+ * @param s
+ * @return
+ */
+ private def norm(s: String): String = s.toLowerCase
+
+ /**
+ *
+ * @param awaitable
+ * @tparam T
+ * @return
+ */
+ private def syncExec[T](awaitable: scala.concurrent.Awaitable[T]): T = Await.result(awaitable, Duration.Inf)
+
+ /**
+ *
+ * @param corpusNlpSeq
+ * @param elemSingleVals
+ * @return
+ */
+ private def mkRequests(corpusNlpSeq: Seq[Seq[NCNlpWord]], elemSingleVals: Set[String]): Iterable[Request] =
+ corpusNlpSeq.
+ flatMap {
+ corpusNlp =>
+ lazy val corpusWords = corpusNlp.map(_.word)
+
+ def getIndexes(corpVals: Seq[String], vals: Set[String]): Set[Int] =
+ vals.flatMap(v => {
+ val i = corpVals.indexOf(v)
+
+ if (i >= 0) Some(i) else None
+ })
+
+ val elemSingleValsNorm = elemSingleVals.map(norm)
+ val elemSingleValsStem = elemSingleVals.map(stem)
+
+ val idxs =
+ getIndexes(corpusNlp.map(_.normalWord), elemSingleValsNorm) ++
+ getIndexes(corpusNlp.map(_.stem), elemSingleValsStem) ++
+ // A sample can contain words in plural form.
+ // We can compare them with synonym values (assuming that model synonym values are defined as lemmas).
+ getIndexes(corpusNlp.map(p => norm(p.lemma)), elemSingleValsNorm)
+
+ def mkRequest(idx: Int, syn: String): Request = {
+ var newSen = substitute(corpusWords, syn, idx)
+
+ val nlpWordsNew = parser.parse(newSen.mkString(" "))
+
+ require(corpusWords.size == nlpWordsNew.size)
+
+ val pos = corpusNlp(idx).pos
+ val posNew = nlpWordsNew(idx).pos
+
+ if (NOUNS_POS_SINGULAR.contains(pos) && NOUNS_POS_PLURALS.contains(posNew))
+ newSen = substitute(corpusWords, CONVERTER.depluralize(syn), idx)
+ else if (NOUNS_POS_PLURALS.contains(pos) && NOUNS_POS_SINGULAR.contains(posNew))
+ newSen = substitute(corpusWords, CONVERTER.pluralize(syn), idx)
+
+ Request(newSen, idx)
+ }
+
+ for (idx <- idxs; syn <- elemSingleVals)
+ yield mkRequest(idx, syn)
+ }
+
+ /**
+ * Context word server returned values have confidence in range (0..2).
+ *
+ * @param conf Context word server confidence value.
+ */
+ private def normalizeConf(conf: Double): Double = conf / MAX_CTXWORD_SCORE
+
+ /**
+ *
+ * @param cfg
+ * @param key
+ * @param vh
+ * @param parent
+ * @return
+ */
+ private def getCorpusData(cfg: NCCtxWordCategoriesConfigMdo, key: ModelProbeKey, vh: ValuesHolder, parent: Span = null):
+ Map[/** Element ID */String, ElementData] =
+ elemsCorpuses.synchronized { elemsCorpuses.get(key) } match {
+ case Some(cache) => cache
+ case None =>
+ val res = askSamples(cfg, vh, parent)
+
+ elemsCorpuses.synchronized { elemsCorpuses += key -> res }
+
+ res
+ }
+
+ /**
+ *
+ * @param cfg
+ * @param key
+ * @return
+ */
+ private def getValuesData(cfg: NCCtxWordCategoriesConfigMdo, key: ModelProbeKey): ValuesHolder =
+ valuesStems.synchronized { valuesStems.get(key) } match {
+ case Some(cache) => cache
+ case None =>
+ def mkMap(convert: String => String): Map[String, Set[String]] =
+ cfg.singleValues.
+ flatMap { case (elemId, vals) => vals.map { case (_, vals) => vals.map(convert(_) -> elemId) } }.
+ flatten.
+ groupBy { case (converted, _) => converted }.
+ map { case (converted, map) => converted -> map.map { case (_, elemId) => elemId }.toSet }
+
+ val normsMap = mkMap(norm)
+ val stemsMap = mkMap(stem)
+
+ val h = ValuesHolder(normal = normsMap, stems = stemsMap.filter(p => !normsMap.keySet.contains(p._1)))
+
+ valuesStems.synchronized { valuesStems += key -> h }
+
+ h
+ }
+
+ /**
+ *
+ * @param words
+ * @param word
+ * @param index
+ * @return
+ */
+ private def substitute(words: Seq[String], word: String, index: Int): Seq[String] = {
+ require(index < words.length)
+
+ words.zipWithIndex.map { case (w, i) => if (i != index) w else word }
+ }
+
+ /**
+ *
+ * @param req
+ * @param sugg
+ * @return
+ */
+ private def getLemma(req: Request, sugg: Suggestion): String =
+ parser.parse(substitute(req.words, sugg.word, req.index).mkString(" "))(req.index).lemma
+
+ /**
+ *
+ * @param cfg
+ * @param vh
+ * @param parent
+ */
+ @throws[NCE]
+ private def askSamples(cfg: NCCtxWordCategoriesConfigMdo, vh: ValuesHolder, parent: Span = null):
+ Map[/** Element ID */String, ElementData] = {
+ val corpusNlp = cfg.corpus.toSeq.map(s => parser.parse(s))
+
+ val recs: Map[String, Seq[Request]] =
+ (
+ for (
+ (elemId, elemSingleVals) <- cfg.singleValues.toSeq;
+ elemSingleValsSet = elemSingleVals.flatMap(_._2).toSet;
+ suggReq <- mkRequests(corpusNlp, elemSingleValsSet)
+ ) yield (elemId, suggReq)
+ ).
+ groupBy { case (elemId, _) => elemId }.
+ map { case (elemId, m) => elemId -> m.map(_._2) }
+
+ if (recs.nonEmpty) {
+ val respsSeq: Seq[(Request, Seq[Suggestion])] =
+ syncExec(NCSuggestSynonymManager.suggestWords(recs.flatMap(_._2).toSeq, parent = parent)).
+ toSeq.sortBy(p => (p._1.words.mkString, p._1.index))
+
+ if (DEEP_DEBUG) {
+ val t = NCAsciiTable()
+
+ t #= ("Request", "Responses")
+
+ for ((req, resp) <- respsSeq)
+ t += (req, s"${resp.map(p => s"${p.word}=${FMT.format(normalizeConf(p.score))}").mkString(", ")}")
+
+ t.trace(logger, Some("Corpus requests:"))
+ }
+
+ val req2Elem = recs.flatMap { case (elemId, recs) => recs.map(p => p -> elemId) }
+
+ def mkMap(convert: (Request, Suggestion) => String):
+ Map[/** Element ID */ String, /** Word key */ Map[String, /** Confidences */ Seq[Double]]] = {
+ val seq: Seq[(String, Map[String, Double])] =
+ respsSeq.
+ map { case (req, suggs) =>
+ (
+ req2Elem(req),
+ suggs.groupBy(sygg => convert(req, sygg)).
+ // If different word forms have different confidences (`Abc` - 0.9, `abc` - 0.7),
+ // we use the maximum (0.9).
+ map { case (key, suggs) => key -> suggs.map(p => normalizeConf(p.score)).max }
+ )
+ }
+ seq.
+ groupBy { case (elemId, _) => elemId }.
+ map { case (elemId, data) =>
+ elemId ->
+ data.flatMap(_._2).
+ groupBy { case (word, _) => word }.
+ map { case (word, data) => word -> data.map { case (_, confs) => confs } }
+ }
+ }
+
+ val normals = mkMap { (_, sugg) => norm(sugg.word) }
+ val stems = mkMap { (_, sugg) => stem(sugg.word) }
+ val lemmas = mkMap { (req, sugg) => getLemma(req, sugg) }
+
+ def mkTable(): NCAsciiTable =
+ if (DEEP_DEBUG) {
+ val t = NCAsciiTable()
+
+ t #= ("Element", "Confidences for normal forms")
+
+ t
+ }
+ else
+ null
+
+ val (tabAll, tabNorm) = (mkTable(), mkTable())
+
+ val res =
+ (normals.keySet ++ stems.keySet ++ lemmas.keySet).map(elemId =>
+ elemId -> {
+ def get[T, K](m: Map[String, Map[T, K]]): Map[T, K] = m.getOrElse(elemId, Map.empty)
+
+ (get(normals), get(stems), get(lemmas))
+ }
+ ).
+ toMap.
+ map { case (elemId, (normals, stems, lemmas)) =>
+ // Skip suggestions that already exist as values for the element.
+ def dropValues[T](words: Map[String, Seq[Double]], vals: Map[String, Set[String]]):
+ Map[String, Seq[Double]] =
+ words.filter { case (word, _) => vals.get(word) match {
+ case Some(elemIds) => !elemIds.contains(elemId)
+ case None => true
+ }}
+
+ val normalsAll = dropValues(normals, vh.normal)
+ val stemsAll = dropValues(stems -- normalsAll.keySet, vh.stems)
+ val lemmasAll = lemmas -- normals.keySet -- stemsAll.keySet
+
+ def mkDebugElementCell(normsSize: Int, stemsSize: Int, lemmasSize: Int): String =
+ s"Element: $elemId [normals=$normsSize, stems=$stemsSize, lemmas=$lemmasSize]"
+
+ if (DEEP_DEBUG)
+ tabAll += (
+ mkDebugElementCell(normalsAll.size, stemsAll.size, lemmasAll.size),
+ toStr(
+ normalsAll.toSeq.
+ sortBy(p => (-p._2.max, -p._2.size)).map(
+ { case (k, confs) => s"$k=${toStr(confs.sortBy(-_).map(p => FMT.format(p)))}" }
+ )
+ )
+ )
+
+ def squeeze(map: Map[String, Seq[Double]]): Map[String, Double] =
+ map.flatMap { case (wordKey, confs) =>
+ ConfMath.squeeze(confs) match {
+ case Some(conf) => Some(wordKey -> conf)
+ case None => None
+ }
+ }
+
+ val normalsSingle = squeeze(normalsAll)
+ val stemsSingle = squeeze(stemsAll)
+ val lemmasSingle = squeeze(lemmasAll)
+
+ if (DEEP_DEBUG)
+ tabNorm += (
+ mkDebugElementCell(normalsSingle.size, stemsSingle.size, lemmasSingle.size),
+ toStr(
+ normalsSingle.toSeq.sortBy(-_._2).map(
+ { case (k, factor) => s"$k=${FMT.format(factor)}" }
+ )
+ )
+ )
+
+ elemId -> ElementData(normalsSingle, stemsSingle, lemmasSingle)
+ }
+
+ if (DEEP_DEBUG) {
+ tabAll.trace(logger, Some("Model corpus all confidences:"))
+ tabNorm.trace(logger, Some("Model corpus normalized confidences:"))
+ }
+
+ res
+ }
+ else
+ Map.empty[String, ElementData]
+ }
+
+ override def enrich(ns: NCNlpSentence, parent: Span): Unit =
+ startScopedSpan("enrich", parent) { _ =>
+ ns.ctxWordConfig match {
+ case Some(cfg) =>
+ val detected = mutable.HashMap.empty[NCNlpSentenceToken, mutable.HashSet[ElementConfidence]]
+
+ def add(nounTok: NCNlpSentenceToken, elemId: String, conf: Confidence): Unit = {
+ val tokElems = detected.getOrElseUpdate(nounTok, mutable.HashSet.empty[ElementConfidence])
+
+ tokElems.find(_.elementId == elemId) match {
+ case Some(exConf) =>
+ if (conf.value > exConf.confidence.value) {
+ tokElems += ElementConfidence(elemId, conf)
+ tokElems -= exConf
+ }
+ case None =>
+ tokElems += ElementConfidence(elemId, conf)
+ }
+ }
+
+ val nouns = ns.tokens.filter(t => NOUNS_POS.contains(t.pos))
+
+ if (nouns.nonEmpty) {
+ val key = ModelProbeKey(cfg.probeId, cfg.modelId)
+
+ // 1. Values. Direct.
+ val vh = getValuesData(cfg, key)
+
+ val (vNorms, vStems) = (vh.normal, vh.stems)
+
+ if (DEEP_DEBUG)
+ logger.trace(
+ s"Model loaded [" +
+ s"key=$key, elements: " +
+ s"${cfg.elements.mkString(", ")}, " +
+ s"values data=$vh]"
+ )
+
+ def get(m: Map[String, Set[String]], key: String): Set[String] = m.getOrElse(key, Set.empty)
+
+ for (
+ n <- nouns;
+ elemId <- get(vNorms, n.normText) ++ get(vNorms, norm(n.lemma)) ++ get(vStems, n.stem)
+ )
+ add(n, elemId, Confidence(INCL_MAX_CONFIDENCE))
+
+ // 2. Via corpus.
+ val corpusData = getCorpusData(cfg, key, vh, parent)
+
+ for (
+ nounTok <- nouns;
+ (elemId, elemData) <- corpusData;
+ confOpt = elemData.get(nounTok.normText, nounTok.stem, nounTok.lemma)
+ if confOpt.isDefined && confOpt.get >= cfg.elements(elemId)
+ )
+ add(nounTok, elemId, Confidence(confOpt.get))
+
+ // 3. Ask for sentence (via co-references)
+ val idxs = ns.tokens.flatMap(p => if (p.pos.startsWith("N")) Some(p.index) else None).toSeq
+ val reqs = idxs.map(idx => Request(ns.tokens.map(_.origText).toSeq, idx))
+
+ val resps: Map[Suggestion, Request] =
+ syncExec(NCSuggestSynonymManager.suggestWords(reqs, parent = parent)).
+ flatMap { case (req, suggs) => suggs.map(_ -> req) }
+
+ if (DEEP_DEBUG) {
+ val t = NCAsciiTable()
+
+ t #= ("Request", "Responses")
+
+ resps.toSeq.groupBy(_._2.index).foreach { case (_, seq) =>
+ val sorted = seq.sortBy(-_._1.score)
+
+ t += (
+ sorted.head._2,
+ s"${
+ sorted.map(_._1).
+ map(p => s"${p.word}=${FMT.format(normalizeConf(p.score))}").
+ mkString(", ")
+ }"
+ )
+ }
+
+ t.trace(logger, Some(s"Sentence requests processing [key=$key, sentence=${ns.text}]"))
+ }
+
+ case class Key(elementId: String, token: NCNlpSentenceToken)
+
+ val missed = if (DEEP_DEBUG) mutable.HashMap.empty[Key, ArrayBuffer[Confidence]] else null
+
+ def calcConf(elemId: String, data: ElementData, req: Request, s: Suggestion): Option[Double] = {
+ val suggNorm = norm(s.word)
+ val suggStem = stem(s.word)
+
+ if (
+ vh.normal.getOrElse(suggNorm, Set.empty).contains(elemId) ||
+ vh.stems.getOrElse(suggStem, Set.empty).contains(elemId)
+ )
+ Some(1.0)
+ else
+ data.get(norm = suggNorm, stem = suggStem, lemma = getLemma(req, s))
+ }
+
+ for (
+ // Token index (req.index) should be correct because the request is created from the original
+ // words, separated by spaces, and the Suggestion Manager uses a space tokenizer.
+ (sugg, req) <- resps.toSeq.sortBy(_._2.index);
+ suggConf = normalizeConf(sugg.score);
+ (elemId, elemData) <- corpusData;
+ elemConf = cfg.elements(elemId);
+ valOrCorpConfOpt = calcConf(elemId, elemData, req, sugg)
+ if valOrCorpConfOpt.isDefined;
+ valOrCorpConf = valOrCorpConfOpt.get;
+ normConf = ConfMath.calculate(suggConf, valOrCorpConf)
+ ) {
+ def mkConf(): Confidence = Confidence(normConf, Some(Reason(sugg.word, suggConf, valOrCorpConf)))
+ def getToken: NCNlpSentenceToken = ns.tokens(req.index)
+
+ if (normConf >= elemConf)
+ add(getToken, elemId, mkConf())
+ else if (DEEP_DEBUG)
+ missed.getOrElseUpdate(Key(elemId, getToken), mutable.ArrayBuffer.empty) += mkConf()
+ }
+
+ ns.ctxWordCategories = detected.map {
+ case (tok, confs) => tok.index -> confs.map(p => p.elementId -> p.confidence.value).toMap
+ }.toMap
+
+ if (DEEP_DEBUG) {
+ require(missed != null)
+
+ missed.filter { case (key, _) =>
+ !detected.exists {
+ case (tok, confs) => confs.exists(conf => Key(conf.elementId, tok) == key)
+ }
+ }.sortBy { case (key, _) => (key.token.index, key.elementId) }.
+ foreach { case (key, confs) =>
+ logger.trace(
+ s"Unsuccessful attempt [" +
+ s"elementId=${key.elementId}, " +
+ s"tokenWordIndexes=${key.token.wordIndexes.mkString(",")}, " +
+ s"confidences=${confs.sortBy(-_.value).mkString(", ")}" +
+ s"]"
+ )
+ }
+
+ logger.trace("Sentence detected elements:")
+
+ for ((tok, elems) <- detected)
+ logger.trace(s"${tok.origText}: ${elems.mkString(", ")}")
+ }
+ }
+
+ case None => // No-op.
+ }
+ }
+
+ /**
+ * Clears cached values and corpus data for the disconnected probe.
+ * @param probeId Probe ID.
+ * @param parent Optional parent span.
+ */
+ def onDisconnectProbe(probeId: String, parent: Span = null): Unit =
+ startScopedSpan("onDisconnectProbe", parent) { _ =>
+ valuesStems.synchronized { valuesStems --= valuesStems.keySet.filter(_.probeId == probeId) }
+ elemsCorpuses.synchronized { elemsCorpuses --= elemsCorpuses.keySet.filter(_.probeId == probeId) }
+ }
+}
\ No newline at end of file
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/probe/NCProbeManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/probe/NCProbeManager.scala
index cd0d5a5..7f8437e 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/probe/NCProbeManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/probe/NCProbeManager.scala
@@ -32,8 +32,9 @@
import org.apache.nlpcraft.common.{NCService, _}
import org.apache.nlpcraft.probe.mgrs.NCProbeMessage
import org.apache.nlpcraft.server.company.NCCompanyManager
-import org.apache.nlpcraft.server.mdo.{NCCompanyMdo, NCProbeMdo, NCProbeModelMdo, NCUserMdo}
+import org.apache.nlpcraft.server.mdo._
import org.apache.nlpcraft.server.nlp.enrichers.NCServerEnrichmentManager
+import org.apache.nlpcraft.server.nlp.enrichers.ctxword.NCContextWordCategoriesEnricher
import org.apache.nlpcraft.server.proclog.NCProcessLogManager
import org.apache.nlpcraft.server.query.NCQueryManager
import org.apache.nlpcraft.server.sql.NCSql
@@ -264,6 +265,9 @@
// Clears unused models.
mdls --= mdls.keys.filter(id => !probes.exists { case (_, p) => p.probe.models.exists(_.id == id) })
+
+ // TODO: add new interface for server enrichers? (services)
+ NCContextWordCategoriesEnricher.onDisconnectProbe(probeKey.probeId)
}
case Some(hld) =>
@@ -603,7 +607,8 @@
s"probeToken=$probeTkn, " +
s"probeId=$probeId, " +
s"proveGuid=$probeGuid" +
- s"]")
+ s"]"
+ )
if (isMultipleProbeRegistrations(probeKey))
respond("S2P_PROBE_MULTIPLE_INSTANCES")
@@ -621,7 +626,10 @@
String,
String,
java.util.Set[String],
- java.util.Set[String]
+ java.util.Set[String],
+ java.util.Map[String, java.util.Map[String, java.util.Set[String]]],
+ java.util.Set[String],
+ java.util.Map[String, Double]
)]]("PROBE_MODELS").
map {
case (
@@ -629,20 +637,44 @@
mdlName,
mdlVer,
enabledBuiltInToks,
- elmIds
+ elmIds,
+ singleValues,
+ corpus,
+ categoriesElements
) =>
require(mdlId != null)
require(mdlName != null)
require(mdlVer != null)
require(enabledBuiltInToks != null)
require(elmIds != null)
+ require(singleValues.isEmpty && corpus.isEmpty || !singleValues.isEmpty && !corpus.isEmpty)
NCProbeModelMdo(
id = mdlId,
name = mdlName,
version = mdlVer,
enabledBuiltInTokens = enabledBuiltInToks.asScala.toSet,
- elementIds = elmIds.asScala.toSet
+ elementIds = elmIds.asScala.toSet,
+ ctxWordConfig =
+ if (!singleValues.isEmpty) {
+ Some(
+ NCCtxWordCategoriesConfigMdo(
+ probeId = probeId,
+ modelId = mdlId,
+ singleValues = singleValues.asScala.map {
+ case (elemId, map) =>
+ elemId ->
+ map.asScala.map {
+ case (value, syns) => value -> syns.asScala.toSet
+ }.toMap
+ }.toMap,
+ corpus = corpus.asScala.toSet,
+ elements = categoriesElements.asScala.toMap
+ )
+ )
+ }
+ else
+ None
)
}.toSet
@@ -710,7 +742,7 @@
else
logger.warn(s"Message ignored: $probeMsg")
}
-
+
/**
* Processes the messages received from the probe.
*
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/query/NCQueryManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/query/NCQueryManager.scala
index a12a4e8..301bc59 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/query/NCQueryManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/query/NCQueryManager.scala
@@ -272,7 +272,7 @@
logger.info(s"New user request received:\n$tbl")
- val enabledBuiltInToks = NCProbeManager.getModel(mdlId, span).enabledBuiltInTokens
+ val mdl = NCProbeManager.getModel(mdlId, span)
@throws[NCE]
def unzipProperties(gzipOpt: Option[String]): Option[JavaMeta] =
@@ -288,7 +288,7 @@
company,
mdlId,
txt0,
- NCServerEnrichmentManager.enrichPipeline(srvReqId, txt0, enabledBuiltInToks),
+ NCServerEnrichmentManager.enrichPipeline(srvReqId, txt0, mdl.enabledBuiltInTokens, mdl.ctxWordConfig),
usrAgent,
rmtAddr,
data,
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/rest/NCBasicRestApi.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/rest/NCBasicRestApi.scala
index 2bbe409..3530e26 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/rest/NCBasicRestApi.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/rest/NCBasicRestApi.scala
@@ -809,7 +809,7 @@
checkModelId(req.mdlId, admUsr.companyId)
- val fut = NCSuggestSynonymManager.suggest(req.mdlId, req.minScore, span)
+ val fut = NCSuggestSynonymManager.suggestModel(req.mdlId, req.minScore, span)
successWithJs(
fut.collect {
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/sugsyn/NCSuggestSynonymManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/sugsyn/NCSuggestSynonymManager.scala
index d89ba98..20b8f70 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/sugsyn/NCSuggestSynonymManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/sugsyn/NCSuggestSynonymManager.scala
@@ -24,7 +24,7 @@
import org.apache.http.client.ResponseHandler
import org.apache.http.client.methods.HttpPost
import org.apache.http.entity.StringEntity
-import org.apache.http.impl.client.HttpClients
+import org.apache.http.impl.client.{CloseableHttpClient, HttpClients}
import org.apache.http.util.EntityUtils
import org.apache.nlpcraft.common._
import org.apache.nlpcraft.common.config.NCConfigurable
@@ -55,7 +55,7 @@
private final val MIN_CNT_MODEL = 20
private final val GSON = new Gson
- private final val TYPE_RESP = new TypeToken[util.List[util.List[Suggestion]]]() {}.getType
+ private final val TYPE_RESP = new TypeToken[util.List[util.List[NCWordSuggestion]]]() {}.getType
private final val SEPARATORS = Seq('?', ',', '.', '-', '!')
private implicit final val ec: ExecutionContext = NCThreadPoolManager.getSystemContext
@@ -64,7 +64,7 @@
val urlOpt: Option[String] = getStringOpt("nlpcraft.server.ctxword.url")
}
- private final val HANDLER: ResponseHandler[Seq[Seq[Suggestion]]] =
+ private final val HANDLER: ResponseHandler[Seq[Seq[NCWordSuggestion]]] =
(resp: HttpResponse) => {
val code = resp.getStatusLine.getStatusCode
val e = resp.getEntity
@@ -76,9 +76,9 @@
code match {
case 200 =>
- val data: util.List[util.List[Suggestion]] = GSON.fromJson(js, TYPE_RESP)
+ val data: util.List[util.List[NCWordSuggestion]] = GSON.fromJson(js, TYPE_RESP)
- data.asScala.map(p => if (p.isEmpty) Seq.empty else p.asScala.tail.toSeq).toSeq
+ data.asScala.map(p => if (p.isEmpty) Seq.empty else p.asScala.toSeq).toSeq
case _ =>
throw new NCE(
@@ -90,9 +90,14 @@
}
}
- case class Suggestion(word: String, score: Double)
+
case class RequestData(sentence: String, ex: String, elmId: String, index: Int)
case class RestRequestSentence(text: String, indexes: util.List[Int])
+ object RestRequestSentence {
+ def apply(text: String, index: Int): RestRequestSentence = new RestRequestSentence(text, Seq(index).asJava)
+
+
+ }
case class RestRequest(sentences: util.List[RestRequestSentence], limit: Int, minScore: Double)
case class Word(word: String, stem: String) {
require(!word.contains(" "), s"Word cannot contains spaces: $word")
@@ -111,6 +116,19 @@
private def toStem(s: String): String = split(s).map(NCNlpPorterStemmer.stem).mkString(" ")
private def toStemWord(s: String): String = NCNlpPorterStemmer.stem(s)
+ @throws[NCE]
+ private def mkUrl = s"${Config.urlOpt.getOrElse(throw new NCE("Context word server is not configured."))}/suggestions"
+
+ private def request(cli: CloseableHttpClient, post: HttpPost): Seq[Seq[NCWordSuggestion]] = {
+ val resps: Seq[Seq[NCWordSuggestion]] =
+ try
+ cli.execute(post, HANDLER)
+ finally
+ post.releaseConnection()
+
+ resps
+ }
+
/**
*
* @param seq1
@@ -131,14 +149,14 @@
}
/**
- *
+ * TODO: refactor async call (waiting should be dropped.)
* @param mdlId
* @param minScoreOpt
* @param parent
* @return
*/
- def suggest(mdlId: String, minScoreOpt: Option[Double], parent: Span = null): Future[NCSuggestSynonymResult] =
- startScopedSpan("inspect", parent, "mdlId" -> mdlId) { _ =>
+ def suggestModel(mdlId: String, minScoreOpt: Option[Double], parent: Span = null): Future[NCSuggestSynonymResult] =
+ startScopedSpan("suggestModel", parent, "mdlId" -> mdlId) { _ =>
val now = U.now()
val promise = Promise[NCSuggestSynonymResult]()
@@ -178,7 +196,7 @@
if (mdlExs.isEmpty)
onError(s"Missed intents samples for: `$mdlId``")
else {
- val url = s"${Config.urlOpt.getOrElse(throw new NCE("Context word server is not configured."))}/suggestions"
+ val url = mkUrl
val allSamplesCnt = mdlExs.map { case (_, samples) => samples.size }.sum
@@ -281,9 +299,9 @@
if (allReqsCnt == 0)
onError(s"Suggestions cannot be generated for model: '$mdlId'")
else {
- val allSgsts = new ConcurrentHashMap[String, util.List[Suggestion]]()
+ val allSgsts = new ConcurrentHashMap[String, util.List[NCWordSuggestion]]()
val cdl = new CountDownLatch(1)
- val debugs = mutable.HashMap.empty[RequestData, Seq[Suggestion]]
+ val debugs = mutable.HashMap.empty[RequestData, Seq[NCWordSuggestion]]
val cnt = new AtomicInteger(0)
val cli = HttpClients.createDefault
@@ -299,7 +317,7 @@
new StringEntity(
GSON.toJson(
RestRequest(
- sentences = batch.map(p => RestRequestSentence(p.sentence, Seq(p.index).asJava)).asJava,
+ sentences = batch.map(p => RestRequestSentence(p.sentence, p.index)).asJava,
minScore = 0,
limit = MAX_LIMIT
)
@@ -308,10 +326,7 @@
)
)
- val resps: Seq[Seq[Suggestion]] = try
- cli.execute(post, HANDLER)
- finally
- post.releaseConnection()
+ val resps = request(cli, post)
require(batch.size == resps.size, s"Batch: ${batch.size}, responses: ${resps.size}")
@@ -322,7 +337,7 @@
logger.debug(s"Executed: $i requests...")
allSgsts.
- computeIfAbsent(elmId, (_: String) => new CopyOnWriteArrayList[Suggestion]()).
+ computeIfAbsent(elmId, (_: String) => new CopyOnWriteArrayList[NCWordSuggestion]()).
addAll(resps.flatten.asJava)
if (i == allReqsCnt)
@@ -441,6 +456,74 @@
}
/**
+ * Requests word suggestions for the given sentence requests.
+ * @param reqs Suggestion requests (sentence words plus target word index).
+ * @param minScoreOpt Optional minimal score filter for returned suggestions.
+ * @param parent Optional parent span.
+ * @return Future with suggestions for each request, sorted by descending score.
+ */
+ def suggestWords(reqs: Seq[NCSuggestionRequest], minScoreOpt: Option[Double] = None, parent: Span = null):
+ Future[Map[NCSuggestionRequest, Seq[NCWordSuggestion]]] =
+ startScopedSpan("suggestWords", parent) { _ =>
+ val promise = Promise[Map[NCSuggestionRequest, Seq[NCWordSuggestion]]]()
+
+ case class Result(request: NCSuggestionRequest, suggestions: Seq[NCWordSuggestion])
+
+ val data = new CopyOnWriteArrayList[Result]()
+ val cli = HttpClients.createDefault
+ val batches = reqs.sliding(BATCH_SIZE, BATCH_SIZE).map(_.toSeq).toSeq
+ val cnt = new AtomicInteger(0)
+
+ for (batch <- batches)
+ U.asFuture(
+ _ => {
+ val post = new HttpPost(mkUrl)
+
+ post.setHeader("Content-Type", "application/json")
+ post.setEntity(
+ new StringEntity(
+ GSON.toJson(
+ RestRequest(
+ sentences = batch.map(p => RestRequestSentence(p.words.mkString(" "), p.index)).asJava,
+ minScore = 0,
+ limit = MAX_LIMIT
+ )
+ ),
+ "UTF-8"
+ )
+ )
+
+ val resps = request(cli, post)
+
+ require(batch.size == resps.size, s"Batch: ${batch.size}, responses: ${resps.size}")
+
+ data.addAll(batch.zip(resps).map { case (req, resp) => Result(req, resp) }.asJava )
+
+ if (cnt.incrementAndGet() == batches.size) {
+ val min = minScoreOpt.getOrElse(DFLT_MIN_SCORE)
+
+ promise.success(
+ data.asScala.groupBy(_.request).map {
+ case (req, ress) =>
+ req -> ress.flatMap(_.suggestions.filter(_.score >= min).toSeq).sortBy(-_.score)
+ }
+ )
+ }
+ ()
+ },
+ (e: Throwable) => {
+ U.prettyError(logger, "Unexpected error:", e)
+
+ promise.failure(e)
+
+ },
+ (_: Unit) => ()
+ )
+
+ promise.future
+ }
+
+ /**
*
* @param parent Optional parent span.
* @return
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/sugsyn/NCSuggestionRequest.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/sugsyn/NCSuggestionRequest.scala
new file mode 100644
index 0000000..108a5f8
--- /dev/null
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/sugsyn/NCSuggestionRequest.scala
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.server.sugsyn
+
+/**
+ * Word suggestion request.
+ * @param words Sentence words.
+ * @param index Index of the target word within `words`.
+ */
+case class NCSuggestionRequest(words: Seq[String], index: Int) {
+ require(index >= 0 && index < words.length)
+
+ override def toString: String =
+ s"Request: ${words.zipWithIndex.map { case (w, i) => if (i != index) w else s"<$w>" }.mkString(" ")}"
+}
+
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/sugsyn/NCWordSuggestion.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/sugsyn/NCWordSuggestion.scala
new file mode 100644
index 0000000..a09b2ca
--- /dev/null
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/sugsyn/NCWordSuggestion.scala
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.server.sugsyn
+
+/**
+ * Word suggestion.
+ * @param word Suggested word.
+ * @param score Suggestion score (higher is better).
+ */
+case class NCWordSuggestion(word: String, score: Double)
\ No newline at end of file
diff --git a/nlpcraft/src/test/resources/log4j2.xml b/nlpcraft/src/test/resources/log4j2.xml
index d9a627b..44590c3 100644
--- a/nlpcraft/src/test/resources/log4j2.xml
+++ b/nlpcraft/src/test/resources/log4j2.xml
@@ -36,7 +36,7 @@
<AppenderRef ref="stdout"/>
<AppenderRef ref="stderr"/>
</Root>
- <Logger name="org.apache.nlpcraft" level="INFO" additivity="false">
+ <Logger name="org.apache.nlpcraft" level="${env:NLPCRAFT_LOG_LEVEL:-INFO}" additivity="false">
<AppenderRef ref="stdout"/>
<AppenderRef ref="stderr"/>
</Logger>
diff --git a/nlpcraft/src/test/resources/org/apache/nlpcraft/model/ctxword/lightswitch_model2.yaml b/nlpcraft/src/test/resources/org/apache/nlpcraft/model/ctxword/lightswitch_model2.yaml
new file mode 100644
index 0000000..3b4e07b
--- /dev/null
+++ b/nlpcraft/src/test/resources/org/apache/nlpcraft/model/ctxword/lightswitch_model2.yaml
@@ -0,0 +1,92 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+id: "nlpcraft.lightswitch.ex2"
+name: "Light Switch Example Model 2"
+version: "1.0"
+description: "NLI-powered light switch example model 2."
+macros:
+ - name: "<ACTION>"
+ macro: "{turn|switch|dial|let|set|get|put}"
+ - name: "<KILL>"
+ macro: "{shut|kill|stop|eliminate}"
+enabledBuiltInTokens: [] # This example doesn't use any built-in tokens.
+permutateSynonyms: true
+abstractTokens:
+ - "ls:part:place"
+ - "ls:part:placeFloor"
+ - "ls:part:placeType"
+ - "ls:part:light"
+sparse: true
+elements:
+ - id: "ls:part:place"
+ description: "Abstract element. Used for top level element `ls:loc`"
+ # TODO: Value set for examples set.
+ categoryConfidence: 0.65
+ values:
+ - name: "room"
+ - name: "bedroom"
+
+ # To simplify the example, a concrete floor type can be recognized by these synonym words.
+ - id: "ls:part:placeFloor"
+ description: "Abstract element. Used for top level element `ls:loc`"
+ synonyms:
+ - "{upstairs|downstairs|{1st|first|2nd|second|3rd|third|4th|5th|top|ground} floor|_}"
+
+ # To simplify the example, a concrete place type can be recognized by these synonym words.
+ - id: "ls:part:placeType"
+ description: "Abstract element. Used for top level element `ls:loc`"
+ synonyms:
+ - "{dinning|laundry|play|master|kid|children|child|guest}"
+
+ - id: "ls:part:light"
+ description: "Abstract element. Used for top level elements `ls:on` and `ls:of`"
+ synonyms:
+ - "{light|illumination|lamp|lamplight}"
+
+ - id: "ls:loc"
+ description: "Top level element. Used in intents.`"
+ synonyms:
+ # Parts can be extracted from `ls:loc` to specify a certain location point.
+ # Part `ls:part:place` is mandatory.
+ # Parts `ls:part:placeFloor` and `ls:part:placeType` are optional.
+ - "{^^{tok_id() == 'ls:part:placeFloor'}^^|_} ^^{tok_id() == 'ls:part:place'}^^ {^^{tok_id() == 'ls:part:placeType'}^^|_}"
+ - "{^^{tok_id() == 'ls:part:placeFloor'}^^|_} {^^{tok_id() == 'ls:part:placeType'}^^|_} ^^{tok_id() == 'ls:part:place'}^^"
+
+ - id: "ls:on"
+ groups:
+ - "act"
+ description: "Light switch ON action.`"
+ synonyms:
+ # Its parts help to catch this element; afterwards they can be ignored.
+ - "<ACTION> {on|up|_} ^^{tok_id() == 'ls:part:light'}^^ {on|up|_}"
+ - "^^{tok_id() == 'ls:part:light'}^^ {on|up}"
+
+ - id: "ls:off"
+ groups:
+ - "act"
+ description: "Light switch OFF action.`"
+ synonyms:
+ # Its parts help to catch this element; afterwards they can be ignored.
+ - "<ACTION> ^^{tok_id() == 'ls:part:light'}^^ {off|out}"
+ - "{<ACTION>|<KILL>} {off|out} ^^{tok_id() == 'ls:part:light'}^^"
+ - "<KILL> ^^{tok_id() == 'ls:part:light'}^^"
+ - "^^{tok_id() == 'ls:part:light'}^^ <KILL>"
+ - "no ^^{tok_id() == 'ls:part:light'}^^"
+
+intents:
+ - "intent=ls term(act)={has(tok_groups(), 'act')} term(loc)={tok_id() == 'ls:loc'}"
\ No newline at end of file
diff --git a/nlpcraft/src/test/resources/samples.txt b/nlpcraft/src/test/resources/org/apache/nlpcraft/model/samples.txt
similarity index 100%
rename from nlpcraft/src/test/resources/samples.txt
rename to nlpcraft/src/test/resources/org/apache/nlpcraft/model/samples.txt
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/NCIntentSampleSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/NCIntentSampleSpec.scala
index 2efbbd2..5ac2f64 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/NCIntentSampleSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/NCIntentSampleSpec.scala
@@ -39,7 +39,8 @@
@NCIntentSample(Array("unknown", "unknown"))
private def onX1(ctx: NCIntentMatch): NCResult = "OK"
- @NCIntentSampleRef("samples.txt")
+ // Look at resources folder.
+ @NCIntentSampleRef("org/apache/nlpcraft/model/samples.txt")
@NCIntent("intent=intent2 term~{tok_id()=='x2'}")
private def onX2(ctx: NCIntentMatch): NCResult = "OK"
}
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCContextWordSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCContextWordSpec.scala
new file mode 100644
index 0000000..e9f28f2
--- /dev/null
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCContextWordSpec.scala
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.model.ctxword
+
+import org.apache.nlpcraft.model.{NCContext, NCElement, NCIntent, NCIntentSample, NCModel, NCResult, NCValue}
+import org.apache.nlpcraft.{NCTestContext, NCTestEnvironment}
+import org.junit.jupiter.api.Test
+
+import java.util.{Collections, Optional}
+import java.{lang, util}
+import scala.collection.mutable.ArrayBuffer
+import scala.jdk.CollectionConverters.{CollectionHasAsScala, SeqHasAsJava, SetHasAsJava}
+
+object NCContextWordSpecModel {
+ case class Value(name: String, syns: String*) extends NCValue {
+ override def getName: String = name
+ override def getSynonyms: util.List[String] = (Seq(name) ++ syns).asJava
+ }
+
+ case class Element(id: String, level: Double, values: NCValue*) extends NCElement {
+ override def getId: String = id
+ override def getValues: util.List[NCValue] = values.asJava
+ override def getGroups: util.List[String] = Collections.singletonList("testGroup")
+ override def getCategoryConfidence: Optional[lang.Double] = Optional.of(level)
+ }
+
+ var expected: String = _
+}
+
+import org.apache.nlpcraft.model.ctxword.NCContextWordSpecModel._
+
+class NCContextWordSpecModel extends NCModel {
+ override def getId: String = this.getClass.getSimpleName
+ override def getName: String = this.getClass.getSimpleName
+ override def getVersion: String = "1.0.0"
+
+ // Empirically detected confidence for the given model and requests.
+ val MDL_LEVEL: java.lang.Double = 0.68
+
+ @NCIntentSample(
+ Array(
+ "I like drive my new BMW",
+ "BMW has the best engine",
+ "Luxury cars like Mercedes and BMW are prime targets",
+ "BMW will install side air bags up front",
+ "I want to change BMW engine",
+ "I want to try BMW driver dynamics",
+ "BMW has excellent driver protection",
+ "BMW pricing are going up",
+ "BMW drivers have the highest loyalty",
+
+ "A wild cat is very dangerous",
+ "A fox eat hens",
+ "The fox was already in your chicken house",
+
+ "What is the local temperature?",
+ "This is the first day of heavy rain",
+ "It is the beautiful day, the sun is shining"
+ )
+ )
+ @NCIntent("intent=i term(t)={false}")
+ def x(): NCResult = NCResult.text("OK")
+
+ override def getElements: util.Set[NCElement] =
+ Set(
+ Element("class:cars", MDL_LEVEL, Value("BMW")),
+ Element("class:animal", MDL_LEVEL, Value("fox"), Value("cat", "tomcat")),
+ Element("class:weather", MDL_LEVEL, Value("temperature"), Value("rain"), Value("sun"))
+ ).map(p => {
+ val e: NCElement = p
+
+ e
+ }).asJava
+
+ override def onContext(ctx: NCContext): NCResult = {
+ val varRes = ArrayBuffer.empty[String]
+
+ require(ctx.getVariants.size() == 1)
+
+ val v = ctx.getVariants.asScala.head
+
+ val testGroupToks = v.asScala.toSeq.filter(_.getGroups.contains("testGroup"))
+
+ val elemIds = testGroupToks.map(_.getId).distinct.mkString(" ")
+ val words = testGroupToks.map(_.getOriginalText).mkString(" ")
+
+ val res =
+ if (NCContextWordSpecModel.expected == s"$elemIds $words")
+ "OK"
+ else
+ s"ERROR: variant '${NCContextWordSpecModel.expected}' not found. Found: ${varRes.mkString(", ")}"
+
+ NCResult.text(res)
+ }
+
+ override def getEnabledBuiltInTokens: util.Set[String] = Collections.emptySet()
+}
+
+/**
+ * @see NCContextWordSpecModel
+ */
+@NCTestEnvironment(model = classOf[NCContextWordSpecModel], startClient = true)
+class NCContextWordSpec extends NCTestContext {
+ private def checkSingleVariant(txt: String, elemId: String, words: String*): Unit = {
+ NCContextWordSpecModel.expected = s"$elemId ${words.mkString(" ")}"
+
+ val res = getClient.ask(txt).getResult.get()
+
+ require(res == "OK", s"Unexpected: $res")
+ }
+
+ @Test
+ private[ctxword] def test(): Unit = {
+ checkSingleVariant("I want to have dogs and foxes", "class:animal", "dogs", "foxes")
+ checkSingleVariant("I bought dog's meat", "class:animal", "dog")
+ checkSingleVariant("I bought meat dog's", "class:animal", "dog")
+
+ checkSingleVariant("I want to have a dog and fox", "class:animal", "dog", "fox")
+ checkSingleVariant("I fed your fish", "class:animal", "fish")
+
+ checkSingleVariant("I like to drive my Porsche and Volkswagen", "class:cars", "Porsche", "Volkswagen")
+ checkSingleVariant("Peugeot added motorcycles to its range year ago", "class:cars", "Peugeot")
+
+ checkSingleVariant("The frost is possible today", "class:weather", "frost")
+ checkSingleVariant("There's a very strong wind from the east now", "class:weather", "wind")
+ }
+}
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCContextWordSpec2.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCContextWordSpec2.scala
new file mode 100644
index 0000000..57b41e3
--- /dev/null
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCContextWordSpec2.scala
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.model.ctxword
+
+import org.apache.nlpcraft.model.{NCContext, NCResult}
+import org.apache.nlpcraft.{NCTestContext, NCTestEnvironment}
+import org.junit.jupiter.api.Test
+
+/**
+ * Test model.
+ */
class NCContextWordSpecModel2 extends NCContextWordSpecModel {
    // Any request is accepted; results are reviewed manually via the logs.
    override def onContext(ctx: NCContext): NCResult = NCResult.text("OK")

    // NOTE(review): per the spec below this sets the initial confidence to zero — confirm MDL_LEVEL semantics.
    override val MDL_LEVEL = 0
}
+
+/**
+ * Run this test only to manually review all categories found for the given model.
+ * Note that the initial confidence is set to zero.
+ */
@NCTestEnvironment(model = classOf[NCContextWordSpecModel2], startClient = true)
class NCContextWordSpec2 extends NCTestContext {
    // Same sentences as in NCContextWordSpec; here results are only logged, not asserted.
    private final val SENTENCES = Seq(
        // Animal category.
        "I want to have dogs and foxes",
        "I bought dog's meat",
        "I bought meat dog's",
        "I want to have a dog and fox",
        "I fed your fish",
        // Cars category.
        "I like to drive my Porsche and Volkswagen",
        "Peugeot added motorcycles to its range year ago",
        // Weather category.
        "The frost is possible today",
        "There's a very strong wind from the east now"
    )

    @Test
    private[ctxword] def test(): Unit = SENTENCES.foreach(getClient.ask)
}
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCLightSwitchScalaModel2Spec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCLightSwitchScalaModel2Spec.scala
new file mode 100644
index 0000000..57b076c
--- /dev/null
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCLightSwitchScalaModel2Spec.scala
@@ -0,0 +1,192 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.model.ctxword
+
+import com.fasterxml.jackson.databind.ObjectMapper
+import com.fasterxml.jackson.module.scala.DefaultScalaModule
+import org.apache.nlpcraft.model.tools.test.NCTestAutoModelValidator
+import org.apache.nlpcraft.model.{NCIntentRef, NCIntentSample, NCIntentTerm, NCModelFileAdapter, NCResult, NCToken}
+import org.apache.nlpcraft.{NCTestContext, NCTestEnvironment}
+import org.junit.jupiter.api.{Assertions, Test}
+
+import scala.jdk.CollectionConverters.ListHasAsScala
+
object NCContextWordSpecModel3Data {
    // Shared JSON mapper, configured once to (de)serialize Scala case classes.
    final val MAPPER: ObjectMapper = {
        val m = new ObjectMapper()

        m.registerModule(DefaultScalaModule)

        m
    }
}
+
// JSON-serializable payload produced by the `NCLightSwitchScalaModel2` intent callback
// and parsed back by the spec for comparison.
case class NCContextWordSpecModel3Data(
    action: String,                      // "on" or "off".
    place: String,                       // Detected place word, lower-cased.
    placeType: Option[String] = None,    // Optional place qualifier (e.g. "guest"), absent when not detected.
    placeFloor: Option[String] = None,   // Optional floor qualifier (e.g. "upstairs"), absent when not detected.
    placeConfidence: java.lang.Double = 0 // Detection confidence, read from the place token's metadata.
)
+
+import org.apache.nlpcraft.model.ctxword.NCContextWordSpecModel3Data._
+
+/**
+ * Test model.
+ */
class NCLightSwitchScalaModel2 extends NCModelFileAdapter("org/apache/nlpcraft/model/ctxword/lightswitch_model2.yaml") {
    @NCIntentRef("ls")
    @NCIntentSample(Array(
        "Turn the lights off in the room.",
        "Set the lights on in in the room.",
        "Lights up in the kitchen.",
        "Please, put the light out in the upstairs bedroom.",
        "Turn the lights off in the guest bedroom.",
        "No lights in the first floor guest washroom, please.",
        "Light up the garage, please!",
        "Kill the illumination now second floor kid closet!"
    ))
    def onMatch(@NCIntentTerm("act") actTok: NCToken, @NCIntentTerm("loc") locTok: NCToken): NCResult = {
        // Single lookup shared by the mandatory and optional accessors below
        // (the original duplicated this `find` in both of them).
        def findPart(id: String): Option[NCToken] = locTok.getPartTokens.asScala.find(_.getId == id)
        // Mandatory part token; a missing token fails the test immediately.
        def getPart(id: String): NCToken =
            findPart(id).getOrElse(throw new AssertionError(s"Token not found: $id"))
        // Optional part token's original text, lower-cased; None when not detected.
        def getPartTextOpt(id: String): Option[String] = findPart(id).map(_.getOriginalText.toLowerCase)

        val place = getPart("ls:part:place")
        // Detection confidence stored in the place token's metadata.
        val conf: Double = place.meta("ls:part:place:confidence")

        // Serialize the match into JSON so the spec can deserialize and compare it.
        NCResult.json(
            MAPPER.writeValueAsString(
                NCContextWordSpecModel3Data(
                    action = if (actTok.getId == "ls:on") "on" else "off",
                    place = place.getOriginalText.toLowerCase,
                    placeConfidence = conf,
                    placeType = getPartTextOpt("ls:part:placeType"),
                    placeFloor = getPartTextOpt("ls:part:placeFloor")
                )
            )
        )
    }
}
+
+/**
+ * Verifies samples set.
+ */
class NCLightSwitchScalaModel2SpecSamples {
    /** Validates the model's `@NCIntentSample` sentences via the standard auto-validator. */
    @Test
    private[ctxword] def testSamplesStandard(): Unit = {
        System.setProperty("NLPCRAFT_TEST_MODELS", classOf[NCLightSwitchScalaModel2].getName)

        val valid = NCTestAutoModelValidator.isValid()

        Assertions.assertTrue(valid, "See error logs above.")
    }
}
+
+/**
+ * Extra values set.
+ */
@NCTestEnvironment(model = classOf[NCLightSwitchScalaModel2], startClient = true)
class NCLightSwitchScalaModel2Spec extends NCTestContext {
    import org.apache.nlpcraft.model.ctxword.{NCContextWordSpecModel3Data => R}

    /**
     * Asks each sentence and compares the deserialized JSON result with the expected payload.
     * Confidence is deliberately excluded from the comparison and only reported on success.
     *
     * @param testsData Pairs of sentence and expected result.
     */
    private def check(testsData: (String, NCContextWordSpecModel3Data)*): Unit = {
        val failures = collection.mutable.HashMap.empty[String, String]
        val passed = collection.mutable.ArrayBuffer.empty[String]

        for ((txt, expected) <- testsData) {
            val res = getClient.ask(txt)

            if (!res.isOk)
                failures += txt -> res.getResultError.get()
            else {
                val actual = MAPPER.readValue(res.getResult.get(), classOf[R])

                // Comparison key without the confidence field.
                def getMainData(d: NCContextWordSpecModel3Data): String =
                    s"Main [action=${d.action}, place=${d.place}, placeType=${d.placeType}, placeFloor=${d.placeFloor}]"

                val actualData = getMainData(actual)
                val expData = getMainData(expected)

                if (expData != actualData)
                    failures += txt -> s"Expected: $expData, actual: $actualData"
                else
                    passed += s"`$txt` processed ok with detected place `${actual.place}` and confidence `${actual.placeConfidence}`."
            }
        }

        println(s"Test passed: ${passed.size}")
        println(s"Test errors: ${failures.size}")

        passed.foreach(println)

        if (failures.nonEmpty)
            throw new AssertionError(failures.mkString("\n"))
    }

    /**
     * `ls:part:place` declares two values: `room` and `bedroom`.
     * The samples also contain `kitchen`, `washroom`, `garage` and `closet`;
     * these words are detected with confidence < 1.
     */
    @Test
    def testSamplesDetailed(): Unit =
        check(
            "Turn the lights off in the room." ->
                R(action = "off", place = "room"),
            "Set the lights on in in the room." ->
                R(action = "on", place = "room"),
            "Lights up in the kitchen." ->
                R(action = "on", place = "kitchen"),
            "Please, put the light out in the upstairs bedroom." ->
                R(action = "off", place = "bedroom", placeFloor = Some("upstairs")),
            "Turn the lights off in the guest bedroom." ->
                R(action = "off", place = "bedroom", placeType = Some("guest")),
            "No lights in the first floor guest washroom, please." ->
                R(action = "off", place = "washroom", placeType = Some("guest"), placeFloor = Some("first floor")),
            "Light up the garage, please!" ->
                R(action = "on", place = "garage"),
            "Kill the illumination now second floor kid closet!" ->
                R(action = "off", place = "closet", placeType = Some("kid"), placeFloor = Some("second floor"))
        )

    /**
     * `ls:part:place` declares two values: `room` and `bedroom`.
     * This test uses `loft`, `hallway`, `library`, `chamber` and `office`,
     * which do not appear in the samples; they are detected with confidence < 1.
     */
    @Test
    def testSynonymsSameCategory(): Unit =
        check(
            "Turn the lights off in the loft." ->
                R(action = "off", place = "loft"),
            "Set the lights on in in the loft." ->
                R(action = "on", place = "loft"),
            "Lights up in the hallway." ->
                R(action = "on", place = "hallway"),
            "Please, put the light out in the upstairs library." ->
                R(action = "off", place = "library", placeFloor = Some("upstairs")),
            "Turn the lights off in the guest office." ->
                R(action = "off", place = "office", placeType = Some("guest")),
            "No lights in the first floor guest chamber, please." ->
                R(action = "off", place = "chamber", placeType = Some("guest"), placeFloor = Some("first floor")),
            "Light up the office, please!" ->
                R(action = "on", place = "office"),
            "Kill the illumination now second floor kid chamber!" ->
                R(action = "off", place = "chamber", placeType = Some("kid"), placeFloor = Some("second floor"))
        )
}
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
index f06819c..0a459a1 100644
--- a/pom.xml
+++ b/pom.xml
@@ -156,6 +156,7 @@
<lightstep.grpc.ver>0.15.8</lightstep.grpc.ver>
<junit.ver>5.5.1</junit.ver>
<jsonpath.ver>2.4.0</jsonpath.ver>
+ <jibx.tools.ver>1.3.3</jibx.tools.ver>
<!-- Force specific encoding on text resources. -->
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
@@ -476,6 +477,12 @@
</dependency>
<dependency>
+ <groupId>org.jibx</groupId>
+ <artifactId>jibx-tools</artifactId>
+ <version>${jibx.tools.ver}</version>
+ </dependency>
+
+ <dependency>
<groupId>edu.stanford.nlp</groupId>
<artifactId>stanford-corenlp</artifactId>
<version>${stanford.corenlp.ver}</version>