/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model
import io.opencensus.trace.Span
import org.apache.nlpcraft.common._
import org.apache.nlpcraft.common.nlp.{NCNlpSentence => Sentence, NCNlpSentenceNote => NlpNote, NCNlpSentenceToken => NlpToken}
import org.apache.nlpcraft.model._
import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.NCIdlContent
import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.NCSynonymChunkKind
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.impl.NCRequestImpl
import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager
import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeVariants, NCTokenPartKey, NCProbeSynonym => Synonym}
import java.io.Serializable
import java.util.{List => JList}
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.CollectionConverters._
import scala.jdk.CollectionConverters.{ListHasAsScala, MapHasAsJava, MapHasAsScala, SeqHasAsJava}
/**
 * Model elements enricher. Scans the sentence for the model's elements using
 * their synonyms and custom parsers, and marks the matched tokens.
 */
object NCModelEnricher extends NCProbeEnricher {
    // Detected token paired with the kind of synonym chunk that matched it.
    type TokType = (NCToken, NCSynonymChunkKind)
    // Element ID -> token index slices already marked as continuous matches.
    type Cache = mutable.Map[String, ArrayBuffer[Seq[Int]]]
object Complex {
def apply(t: NCToken): Complex =
Complex(
data = Left(t),
isToken = true,
isWord = false,
token = t,
word = null,
origText = t.origText,
wordIndexes = t.wordIndexes.toSet,
minIndex = t.wordIndexes.head,
maxIndex = t.wordIndexes.last
)
def apply(t: NlpToken): Complex =
Complex(
data = Right(t),
isToken = false,
isWord = true,
token = null,
word = t,
origText = t.origText,
wordIndexes = t.wordIndexes.toSet,
minIndex = t.wordIndexes.head,
maxIndex = t.wordIndexes.last
)
}
case class Complex(
data: NCIdlContent,
isToken: Boolean,
isWord: Boolean,
token: NCToken,
word: NlpToken,
origText: String,
wordIndexes: Set[Int],
minIndex: Int,
maxIndex: Int
) {
private final val hash = if (isToken) Seq(wordIndexes, token.getId).hashCode() else wordIndexes.hashCode()
override def hashCode(): Int = hash
def isSubsetOf(minIndex: Int, maxIndex: Int, indexes: Set[Int]): Boolean =
if (this.minIndex > maxIndex || this.maxIndex < minIndex)
false
else
wordIndexes.subsetOf(indexes)
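        // E.g. (hypothetical values): with wordIndexes = {2, 3}, isSubsetOf(1, 3, Set(1, 2, 3))
        // is true, while isSubsetOf(4, 6, Set(4, 5, 6)) is false (no range overlap).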
override def equals(obj: Any): Boolean = obj match {
case x: Complex =>
hash == x.hash && (isToken && x.isToken && token == x.token || isWord && x.isWord && word == x.word)
case _ => false
}
        // Added for debugging purposes.
        override def toString: String = {
            val idxs = wordIndexes.mkString(",")
            if (isToken && token.getId != "nlpcraft:nlp") s"'$origText' (${token.getId}) [$idxs]" else s"'$origText' [$idxs]"
        }
}
object ComplexSeq {
def apply(all: Seq[Complex]): ComplexSeq = ComplexSeq(all.filter(_.isToken), all.flatMap(_.wordIndexes).toSet)
}
case class ComplexSeq(tokensComplexes: Seq[Complex], wordsIndexes: Set[Int]) {
private val (idxsSet: Set[Int], minIndex: Int, maxIndex: Int) = {
val seq = tokensComplexes.flatMap(_.wordIndexes).distinct.sorted
(seq.toSet, seq.head, seq.last)
}
def isIntersect(minIndex: Int, maxIndex: Int, idxsSet: Set[Int]): Boolean =
if (this.minIndex > maxIndex || this.maxIndex < minIndex)
false
else
this.idxsSet.exists(idxsSet.contains)
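        // E.g. (hypothetical values): with covered indexes {1, 2, 5}, isIntersect(4, 6, Set(4, 5))
        // is true (shared index 5), while isIntersect(3, 4, Set(3, 4)) is false.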
override def toString: String = tokensComplexes.mkString(" | ")
}
case class ComplexHolder(complexesWords: Seq[Complex], complexes: Seq[ComplexSeq])
    /**
     * Starts this enricher.
     *
     * @param parent Optional parent span.
     * @return Started service.
     */
override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ =>
ackStarting()
ackStarted()
}
    /**
     * Stops this enricher.
     *
     * @param parent Optional parent span.
     */
override def stop(parent: Span = null): Unit = startScopedSpan("stop", parent) { _ =>
ackStopping()
ackStopped()
}
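    /**
     * Tests whether the given model requires complex processing, i.e. has
     * IDL synonyms or custom parsers.
     *
     * @param mdl Probe model.
     */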
def isComplex(mdl: NCProbeModel): Boolean = mdl.hasIdlSynonyms || !mdl.model.getParsers.isEmpty
    /**
     * Marks the given tokens with a new note for the detected model element.
     *
     * @param ns Sentence.
     * @param elem Detected model element.
     * @param toks Tokens to mark.
     * @param direct Direct or permuted match flag.
     * @param syn Optional synonym that matched.
     * @param parts Parts of the composite (IDL) match.
     * @param metaOpt Optional note metadata.
     */
private def mark(
ns: Sentence,
elem: NCElement,
toks: Seq[NlpToken],
direct: Boolean,
syn: Option[Synonym] = None,
parts: Seq[TokType] = Seq.empty,
metaOpt: Option[Map[String, Object]] = None
): Unit = {
val params = mutable.ArrayBuffer.empty[(String, AnyRef)]
// For system elements.
params += "direct" -> direct.asInstanceOf[AnyRef]
syn match {
case Some(s) =>
if (s.isValueSynonym)
params += "value" -> s.value
case None => // No-op.
}
metaOpt match {
case Some(meta) => params += "meta" -> meta
case None => // No-op.
}
if (parts.nonEmpty)
params += "parts" -> parts.map { case (p, kind) => NCTokenPartKey(p, kind) }.asJava
val idxs = toks.map(_.index).sorted
val note = NlpNote(idxs, elem.getId, params: _*)
toks.foreach(_.add(note))
// For NLP elements.
toks.foreach(t => ns.fixNote(t.getNlpNote, "direct" -> direct))
}
    /**
     * Runs the model's custom parsers, if any, over the sentence and marks
     * the elements they return.
     *
     * @param mdl Probe model.
     * @param ns Sentence.
     * @param span Parent span.
     * @param req Request descriptor.
     */
private def processParsers(mdl: NCProbeModel, ns: Sentence, span: Span, req: NCRequestImpl): Unit = {
for (parser <- mdl.model.getParsers.asScala) {
parser.onInit()
startScopedSpan("customParser", span,
"srvReqId" -> ns.srvReqId,
"mdlId" -> mdl.model.getId,
"txt" -> ns.text
) { _ =>
def to(t: NlpToken): NCCustomWord =
new NCCustomWord {
override def getNormalizedText: String = t.normText
override def getOriginalText: String = t.origText
override def getStartCharIndex: Int = t.startCharIndex
override def getEndCharIndex: Int = t.endCharIndex
override def getPos: String = t.pos
override def getPosDescription: String = t.posDesc
override def getLemma: String = t.lemma
override def getStem: String = t.stem
override def isStopWord: Boolean = t.isStopWord
override def isBracketed: Boolean = t.isBracketed
override def isQuoted: Boolean = t.isQuoted
override def isKnownWord: Boolean = t.isKnownWord
override def isSwearWord: Boolean = t.isSwearWord
override def isEnglish: Boolean = t.isEnglish
}
val res = parser.parse(
req,
mdl.model,
ns.map(to).asJava,
ns.flatten.distinct.filter(!_.isNlp).map(n => {
val noteId = n.noteType
val words = ns.filter(t => n.tokenIndexes.contains(t.index)).map(to).asJava
val md = n.asMetadata()
new NCCustomElement() {
override def getElementId: String = noteId
override def getWords: JList[NCCustomWord] = words
override def getMetadata: JavaMeta = md.map(p => p._1 -> p._2.asInstanceOf[AnyRef]).asJava
}
}).asJava
)
if (res != null)
res.asScala.foreach(e => {
val elmId = e.getElementId
val words = e.getWords
if (elmId == null)
throw new NCE(s"Custom model parser cannot return 'null' element ID.")
if (words == null || words.isEmpty)
throw new NCE(s"Custom model parser cannot return empty custom tokens for element: $elmId")
val matchedToks = words.asScala.map(w =>
ns.find(t =>
t.startCharIndex == w.getStartCharIndex && t.endCharIndex == w.getEndCharIndex
).getOrElse(throw new AssertionError(s"Custom model parser returned an invalid custom token: $w"))
)
                        // Mark the element's tokens unless they are already marked.
if (!alreadyMarked(ns, elmId, matchedToks, matchedToks.map(_.index).sorted))
mark(
ns,
elem = mdl.elements.getOrElse(elmId, throw new NCE(s"Custom model parser returned unknown element: $elmId")),
toks = matchedToks,
direct = true,
metaOpt = Some(e.getMetadata.asScala.toMap)
)
})
}
parser.onDiscard()
}
}
    /**
     * Gets all contiguous sub-sequences (sliding windows) of the given tokens,
     * longest first.
     *
     * For example, if the buffer contains tokens "a b c d", this function returns
     * the following token sequences, in this order:
     *
     * "a b c d"
     * "a b c"
     * "b c d"
     * "a b"
     * "b c"
     * "c d"
     * "a"
     * "b"
     * "c"
     * "d"
     *
     * @param toks Tokens to combine.
     * @return All contiguous sub-sequences of the given tokens, longest first.
     */
    private def combos[T](toks: Seq[T]): Seq[Seq[T]] =
        (for (n <- toks.size until 0 by -1) yield toks.sliding(n)).flatten
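    // A quick illustration of the ordering (hypothetical REPL session):
    //
    //   combos(Seq("a", "b", "c"))
    //   // => Vector(List(a, b, c), List(a, b), List(b, c), List(a), List(b), List(c))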
    /**
     * Converts matched IDL content into token parts paired with the kinds of
     * the synonym chunks they matched. Word content is dropped.
     *
     * @param seq Matched IDL content (tokens or words).
     * @param s Matched synonym.
     */
private def toParts(seq: Seq[NCIdlContent], s: Synonym): Seq[TokType] =
seq.zip(s.map(_.kind)).flatMap {
case (complex, kind) => if (complex.isLeft) Some(complex.swap.toOption.get -> kind) else None
}
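    // For instance (hypothetical values): if 'seq' holds Left(tok) and Right(word)
    // and the corresponding chunk kinds are TEXT and REGEX, the result is
    // Seq(tok -> TEXT) - the word entry is filtered out.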
    /**
     * Converts matched IDL content back into sentence tokens, resolving each
     * word to the sentence tokens that cover its word indexes.
     *
     * @param tows Matched IDL content (tokens or words).
     * @param ns Sentence.
     */
private def toTokens(tows: Seq[NCIdlContent], ns: Sentence): Seq[NlpToken] =
(
tows.filter(_.isRight).map(_.toOption.get) ++
tows.filter(_.isLeft).map(_.swap.toOption.get).
flatMap(w => ns.filter(t => t.wordIndexes.intersect(w.wordIndexes).nonEmpty))
).sortBy(_.startCharIndex)
    /**
     * Gets the synonyms for the given element ID, or an empty sequence if none.
     *
     * @param m Element ID to synonyms map.
     * @param id Element ID.
     * @return Synonyms of the element, possibly empty.
     */
    private def get(m: Map[String, Seq[Synonym]], id: String): Seq[Synonym] = m.getOrElse(id, Seq.empty)
    /**
     * Gets the entry for the given element ID and synonym length. Synonyms are
     * already prepared and sorted in descending order by their weight, i.e. the
     * first synonym in a sequence is the most important one.
     *
     * @param fastMap {Element ID -> {Synonym length -> T}}
     * @param elmId Element ID.
     * @param len Synonym length (in tokens).
     * @return Entry for the given element ID and length, if any.
     */
private def fastAccess[T](fastMap: Map[String, Map[Int, T]], elmId: String, len: Int): Option[T] =
fastMap.getOrElse(elmId, Map.empty[Int, T]).get(len)
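    // E.g. fastAccess(m, "my:elem", 2) (hypothetical element ID) yields the entry
    // prepared for 2-token-long synonyms of that element, or None if the element
    // or length is absent.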
    /**
     * Builds the complex holder: word-level complexes for every sentence token
     * plus token/word combinations derived from the collapsed sentence variants.
     *
     * @param mdl Probe model.
     * @param ns Sentence.
     */
private def mkComplexes(mdl: NCProbeModel, ns: Sentence): ComplexHolder = {
val complexesWords = ns.map(Complex(_))
val complexes =
NCProbeVariants.convert(ns.srvReqId, mdl, NCSentenceManager.collapse(mdl.model, ns.clone())).
map(_.asScala).
par.
flatMap(sen =>
                // Token splitting.
                // For example, the sentence "A B C D E" (5 words) is processed as 3 tokens
                // on the first phase after collapsing: 'A B' (2 words), 'C D' (2 words) and 'E' (1 word).
                // So the resulting combinations will be:
                //   Token(AB) + Token(CD) + Token(E)
                //   Token(AB) + Word(C) + Word(D) + Token(E)
                //   Word(A) + Word(B) + Token(CD) + Token(E)
                //   Word(A) + Word(B) + Word(C) + Word(D) + Token(E)
combos(sen).
map(senPartComb => {
sen.flatMap(t =>
                        // A single-word token, or a token present in the current combination,
                        // is kept as a token; a token only partly covered by the combination
                        // is split into its constituent words.
if (t.wordIndexes.length == 1 || senPartComb.contains(t))
Seq(Complex(t))
else
t.wordIndexes.map(complexesWords)
)
                // Drop combinations without tokens (the IDL part works with tokens).
}).filter(_.exists(_.isToken)).map(ComplexSeq(_)).distinct
).seq
ComplexHolder(complexesWords, complexes)
}
    /**
     * Makes token/word combinations that overlap the given tokens, skipping
     * combinations already seen in the cache.
     *
     * @param h Complex holder.
     * @param toks Tokens defining the word indexes of interest.
     * @param cache Already processed combinations.
     */
private def mkCombinations(h: ComplexHolder, toks: Seq[NlpToken], cache: Set[Seq[Complex]]): Seq[Seq[Complex]] = {
val idxs = toks.flatMap(_.wordIndexes).toSet
h.complexes.par.
flatMap(complexSeq => {
val rec = complexSeq.tokensComplexes.filter(_.wordIndexes.exists(idxs.contains))
            // Drop combinations without tokens (the IDL part works with tokens).
if (rec.nonEmpty) {
val data = rec ++
(complexSeq.wordsIndexes.intersect(idxs) -- rec.flatMap(_.wordIndexes)).map(h.complexesWords)
if (!cache.contains(data)) Some(data) else None
}
else
None
}).seq
}
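    /**
     * Adds a detected element occurrence: updates the continuous-match cache and,
     * unless an equivalent note is already present, marks the tokens (see 'mark').
     *
     * @param dbgType Debug type label used in trace logging.
     * @param ns Sentence.
     * @param contCache Continuous matches cache.
     * @param elem Detected model element.
     * @param elemToks Tokens matched for the element.
     * @param sliceToksIdxs Sorted indexes of the processed sentence slice.
     * @param syn Matched synonym.
     * @param parts Parts of the composite (IDL) match.
     */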
private def add(
dbgType: String,
ns: Sentence,
contCache: Cache,
elem: NCElement,
elemToks: Seq[NlpToken],
sliceToksIdxs: Seq[Int],
syn: Synonym,
parts: Seq[TokType] = Seq.empty)
: Unit = {
val resIdxs = elemToks.map(_.index)
val resIdxsSorted = resIdxs.sorted
if (resIdxsSorted == sliceToksIdxs && U.isContinuous(resIdxsSorted))
contCache(elem.getId) += sliceToksIdxs
val ok = !alreadyMarked(ns, elem.getId, elemToks, sliceToksIdxs)
if (ok)
mark(ns, elem, elemToks, direct = syn.isDirect && U.isIncreased(resIdxs), syn = Some(syn), parts = parts)
if (DEEP_DEBUG)
logger.trace(
s"${if (ok) "Added" else "Skipped"} element [" +
s"id=${elem.getId}, " +
s"type=$dbgType, " +
s"text='${elemToks.map(_.origText).mkString(" ")}', " +
s"indexes=${resIdxs.mkString("[", ",", "]")}, " +
s"allTokensIndexes=${sliceToksIdxs.mkString("[", ",", "]")}, " +
s"synonym=$syn" +
s"]"
)
}
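    /**
     * Enriches the given sentence with the model's elements.
     *
     * @param mdl Probe model.
     * @param ns Sentence to enrich.
     * @param senMeta Sentence metadata.
     * @param parent Optional parent span.
     */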
@throws[NCE]
override def enrich(mdl: NCProbeModel, ns: Sentence, senMeta: Map[String, Serializable], parent: Span = null): Unit = {
require(isStarted)
startScopedSpan(
"enrich", parent, "srvReqId" -> ns.srvReqId, "mdlId" -> mdl.model.getId, "txt" -> ns.text
) { span =>
val req = NCRequestImpl(senMeta, ns.srvReqId)
val combToks = combos(ns.toSeq)
lazy val ch = mkComplexes(mdl, ns)
def execute(simpleEnabled: Boolean, idlEnabled: Boolean): Unit =
startScopedSpan(
"execute", span, "srvReqId" -> ns.srvReqId, "mdlId" -> mdl.model.getId, "txt" -> ns.text
) { _ =>
if (DEEP_DEBUG)
logger.trace(s"Execution started [simpleEnabled=$simpleEnabled, idlEnabled=$idlEnabled]")
                val contCache: Cache = mutable.HashMap.empty[String, ArrayBuffer[Seq[Int]]] ++
                    mdl.elements.keys.map(k => k -> mutable.ArrayBuffer.empty[Seq[Int]])
lazy val idlCache = mutable.HashSet.empty[Seq[Complex]]
for (
toks <- combToks;
idxs = toks.map(_.index);
e <- mdl.elements.values;
eId = e.getId
if
!contCache(eId).exists(_.containsSlice(idxs)) &&
!alreadyMarked(ns, eId, toks, idxs)
) {
// 1. SIMPLE.
if (simpleEnabled && (if (idlEnabled) mdl.hasIdlSynonyms(eId) else !mdl.hasIdlSynonyms(eId))) {
lazy val tokStems = toks.map(_.stem).mkString(" ")
// 1.1 Continuous.
var found = false
if (mdl.hasContinuousSynonyms)
fastAccess(mdl.continuousSynonyms, eId, toks.length) match {
case Some(h) =>
def tryMap(syns: Map[String, Synonym], notFound: () => Unit): Unit =
syns.get(tokStems) match {
case Some(s) =>
found = true
add("simple continuous", ns, contCache, e, toks, idxs, s)
case None => notFound()
}
def tryScan(syns: Seq[Synonym]): Unit =
for (s <- syns if !found)
if (s.isMatch(toks)) {
found = true
add("simple continuous scan", ns, contCache, e, toks, idxs, s)
}
tryMap(
h.txtDirectSynonyms,
() => {
tryScan(h.notTxtDirectSynonyms)
if (!found)
tryMap(h.txtNotDirectSynonyms, () => tryScan(h.notTxtNotDirectSynonyms))
}
)
case None => // No-op.
}
// 1.2 Sparse.
if (!found && mdl.hasSparseSynonyms)
for (s <- get(mdl.sparseSynonyms, eId))
s.sparseMatch(toks) match {
case Some(res) => add("simple sparse", ns, contCache, e, res, idxs, s)
case None => // No-op.
}
}
// 2. IDL.
if (idlEnabled) {
val allSyns = get(mdl.idlSynonyms, eId)
lazy val allCombs = mkCombinations(ch, toks, idlCache.toSet)
// 2.1 Continuous.
if (!mdl.hasSparseSynonyms) {
var found = false
for (
s <- allSyns;
comb <- allCombs
if !found;
data = comb.map(_.data)
)
if (s.isMatch(data, req)) {
add("IDL continuous", ns, contCache, e, toks, idxs, s, toParts(data, s))
idlCache += comb
found = true
}
}
else
// 2.2 Sparse.
for (
s <- allSyns;
comb <- allCombs
)
s.sparseMatch(comb.map(_.data), req) match {
case Some(res) =>
val typ = if (s.sparse) "IDL sparse" else "IDL continuous"
add(typ, ns, contCache, e, toTokens(res, ns), idxs, s, toParts(res, s))
idlCache += comb
case None => // No-op.
}
}
}
}
if (ns.firstProbePhase) {
ns.firstProbePhase = false
if (mdl.hasNoIdlSynonyms)
execute(simpleEnabled = true, idlEnabled = false)
execute(simpleEnabled = mdl.hasNoIdlSynonyms, idlEnabled = mdl.hasIdlSynonyms)
}
else if (mdl.hasIdlSynonyms)
execute(simpleEnabled = false, idlEnabled = true)
processParsers(mdl, ns, span, req)
}
}
    // TODO: simplify, add tests, check model properties (sparse etc) for optimization.
    /**
     * Tests whether the given tokens are already covered by a note of the given
     * element, so the match can be skipped.
     *
     * @param ns Sentence.
     * @param elmId Element ID.
     * @param toks Candidate tokens.
     * @param sliceToksIdxsSorted Sorted token indexes of the processed sentence slice.
     * @return 'true' if an equivalent or wider note already exists.
     */
    private def alreadyMarked(ns: Sentence, elmId: String, toks: Seq[NlpToken], sliceToksIdxsSorted: Seq[Int]): Boolean = {
        lazy val toksIdxsSorted = toks.map(_.index).sorted
        // Case 1: every token of the processed slice already carries a continuous
        // (sparsity == 0) note of this element.
        sliceToksIdxsSorted.map(ns).forall(_.exists(n => n.noteType == elmId && n.sparsity == 0)) ||
        // Case 2: some candidate token carries a note of this element that covers
        // an equivalent or wider span.
        toks.exists(_.exists(n =>
            n.noteType == elmId &&
            (
                // Continuous note whose indexes lie inside the slice or cover all candidate tokens.
                (n.sparsity == 0 &&
                    (sliceToksIdxsSorted.containsSlice(n.tokenIndexes) || n.tokenIndexes.containsSlice(toksIdxsSorted))
                )
                ||
                // Note with exactly the same indexes, or a continuous note containing
                // the continuous candidate indexes.
                (
                    n.tokenIndexes == toksIdxsSorted ||
                    n.tokenIndexes.containsSlice(toksIdxsSorted) &&
                    U.isContinuous(toksIdxsSorted) &&
                    U.isContinuous(n.tokenIndexes)
                )
            )
        ))
    }
}