/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model
import io.opencensus.trace.Span
import org.apache.nlpcraft.common._
import org.apache.nlpcraft.common.nlp.NCNlpSentence.NoteLink
import org.apache.nlpcraft.common.nlp.{NCNlpSentence => Sentence, NCNlpSentenceNote => NlpNote, NCNlpSentenceToken => NlpToken}
import org.apache.nlpcraft.model._
import org.apache.nlpcraft.model.impl.NCTokenImpl
import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.NCSynonymChunkKind
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.impl.NCRequestImpl
import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager
import org.apache.nlpcraft.probe.mgrs.synonyms.NCSynonymsManager
import org.apache.nlpcraft.probe.mgrs.{NCProbeIdlToken => IdlToken, NCProbeModel, NCProbeVariants, NCTokenPartKey, NCProbeSynonym => Synonym}
import java.io.Serializable
import java.util.{List => JList}
import scala.collection.mutable
import scala.collection.parallel.CollectionConverters._
import scala.jdk.CollectionConverters.{ListHasAsScala, MapHasAsJava, MapHasAsScala, SeqHasAsJava}
/**
* Model elements enricher.
*/
object NCModelEnricher extends NCProbeEnricher {
type TokType = (NCToken, NCSynonymChunkKind)
object IdlTokensSeq {
def apply(all: Seq[IdlToken]): IdlTokensSeq = IdlTokensSeq(all.filter(_.isToken), all.flatMap(_.wordIndexes).toSet)
}
case class IdlTokensSeq(tokens: Seq[IdlToken], wordsIndexes: Set[Int]) {
private val (idxsSet: Set[Int], minIndex: Int, maxIndex: Int) = {
val seq = tokens.flatMap(_.wordIndexes).distinct.sorted
(seq.toSet, seq.head, seq.last)
}
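// Cheap min/max range check used as a pre-filter before the exact index set intersection.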
def isIntersect(minIndex: Int, maxIndex: Int, idxsSet: Set[Int]): Boolean =
if (this.minIndex > maxIndex || this.maxIndex < minIndex)
false
else
this.idxsSet.exists(idxsSet.contains)
override def toString: String = tokens.mkString(" | ")
}
case class IdlTokensHolder(tokens: Seq[IdlToken], seqs: Seq[IdlTokensSeq])
/**
* Starts this enricher.
*
* @param parent Optional parent span.
* @return This service instance.
*/
override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ =>
ackStarting()
ackStarted()
}
/**
* Stops this enricher.
*
* @param parent Optional parent span.
*/
override def stop(parent: Span = null): Unit = startScopedSpan("stop", parent) { _ =>
ackStopping()
ackStopped()
}
def isComplex(mdl: NCProbeModel): Boolean = mdl.hasIdlSynonyms || !mdl.model.getParsers.isEmpty
/**
* Marks the given tokens with a note for the detected element.
*
* @param ns Sentence.
* @param elemId Element ID.
* @param toks Tokens to mark.
* @param direct Direct or permuted match flag.
* @param syn Optional matched synonym.
* @param parts Parts of the composite token, if any.
* @param metaOpt Optional element metadata.
*/
private def mark(
ns: Sentence,
elemId: String,
toks: Seq[NlpToken],
direct: Boolean,
syn: Option[Synonym] = None,
parts: Seq[TokType] = Seq.empty,
metaOpt: Option[Map[String, Object]] = None
): Unit = {
val params = mutable.ArrayBuffer.empty[(String, AnyRef)]
// For system elements.
params += "direct" -> direct.asInstanceOf[AnyRef]
syn match {
case Some(s) =>
if (s.isValueSynonym)
params += "value" -> s.value
case None => // No-op.
}
metaOpt match {
case Some(meta) => params += "meta" -> meta
case None => // No-op.
}
if (parts.nonEmpty)
params += "parts" -> parts.map { case (p, kind) => NCTokenPartKey(p, kind) }.asJava
val idxs = toks.map(_.index).sorted
val note = NlpNote(idxs, elemId, params: _*)
toks.foreach(_.add(note))
// For NLP elements.
toks.foreach(t => ns.fixNote(t.getNlpNote, "direct" -> direct))
}
/**
* Processes custom model parsers, if any.
*
* @param mdl Probe model.
* @param ns Sentence.
* @param span Parent span.
* @param req Request descriptor.
*/
private def processParsers(mdl: NCProbeModel, ns: Sentence, span: Span, req: NCRequestImpl): Unit = {
for (parser <- mdl.model.getParsers.asScala) {
parser.onInit()
startScopedSpan("customParser", span,
"srvReqId" -> ns.srvReqId,
"mdlId" -> mdl.model.getId,
"txt" -> ns.text
) { _ =>
def to(t: NlpToken): NCCustomWord =
new NCCustomWord {
override def getNormalizedText: String = t.normText
override def getOriginalText: String = t.origText
override def getStartCharIndex: Int = t.startCharIndex
override def getEndCharIndex: Int = t.endCharIndex
override def getPos: String = t.pos
override def getPosDescription: String = t.posDesc
override def getLemma: String = t.lemma
override def getStem: String = t.stem
override def isStopWord: Boolean = t.isStopWord
override def isBracketed: Boolean = t.isBracketed
override def isQuoted: Boolean = t.isQuoted
override def isKnownWord: Boolean = t.isKnownWord
override def isSwearWord: Boolean = t.isSwearWord
override def isEnglish: Boolean = t.isEnglish
}
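// Invokes the custom parser with the sentence words and all already detected non-NLP elements.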
val res = parser.parse(
req,
mdl.model,
ns.map(to).asJava,
ns.flatten.distinct.filter(!_.isNlp).map(n => {
val noteId = n.noteType
val words = ns.filter(t => n.tokenIndexes.contains(t.index)).map(to).asJava
val md = n.asMetadata()
new NCCustomElement() {
override def getElementId: String = noteId
override def getWords: JList[NCCustomWord] = words
override def getMetadata: JavaMeta = md.map(p => p._1 -> p._2.asInstanceOf[AnyRef]).asJava
}
}).asJava
)
if (res != null)
res.asScala.foreach(e => {
val elmId = e.getElementId
val words = e.getWords
if (elmId == null)
throw new NCE(s"Custom model parser cannot return 'null' element ID.")
if (words == null || words.isEmpty)
throw new NCE(s"Custom model parser cannot return empty custom tokens for element: $elmId")
val matchedToks = words.asScala.map(w =>
ns.find(t =>
t.startCharIndex == w.getStartCharIndex && t.endCharIndex == w.getEndCharIndex
).getOrElse(throw new AssertionError(s"Custom model parser returned an invalid custom token: $w"))
)
// Checks element's tokens.
if (!alreadyMarked(ns, elmId, matchedToks, matchedToks.map(_.index).sorted))
mark(
ns,
elemId = elmId,
toks = matchedToks,
direct = true,
metaOpt = Some(e.getMetadata.asScala.toMap)
)
})
}
parser.onDiscard()
}
}
/**
*
* 1. Prepares token combinations (sliding windows of decreasing size).
* Example: 'A B C' -> {'A B C', 'A B', 'B C', 'A', 'B', 'C'}
* One 3-word sentence is converted into 6 pieces.
*
* 2. Additionally, each piece is expanded into a set of variants with all possible
* permutations of its inner stopwords removed.
* Example: the piece 'x1, x2(stopword), x3(stopword), x4' is expanded into
* {'x1, x2, x3, x4', 'x1, x2, x4', 'x1, x3, x4', 'x1, x4'}
*
* 3. All variants are collected and duplicates are dropped.
*
* @param toks Sentence tokens.
*/
private def combosTokens(toks: Seq[NlpToken]): Seq[(Seq[NlpToken], Seq[NlpToken])] =
combos(toks).flatMap(combo => {
val stops = combo.filter(s => s.isStopWord && s != combo.head && s != combo.last)
val slides = mutable.ArrayBuffer.empty[mutable.ArrayBuffer[NlpToken]]
for (stop <- stops)
if (slides.nonEmpty && slides.last.last.index + 1 == stop.index)
slides.last += stop
else
slides += mutable.ArrayBuffer(stop)
// Runs of more than two consecutive stopwords are handled as whole units (not permuted individually).
val bigSlides = slides.filter(_.size > 2)
var stops4Delete: Seq[Seq[NlpToken]] =
if (bigSlides.nonEmpty) {
val allBig = bigSlides.flatten
val stops4AllCombs = stops.filter(p => !allBig.contains(p))
if (stops4AllCombs.nonEmpty)
for (
seq1 <- Range.inclusive(0, stops4AllCombs.size).flatMap(stops4AllCombs.combinations);
seq2 <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations)
)
yield seq1 ++ seq2.flatten
else
for (seq <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations))
yield seq.toSeq.flatten
}
else
Range.inclusive(1, stops.size).flatMap(stops.combinations)
stops4Delete = stops4Delete.filter(seq => !seq.contains(combo.head) && !seq.contains(combo.last))
(Seq(combo) ++ stops4Delete.map(del => combo.filter(t => !del.contains(t)))).map(_ -> combo).distinct
}).
filter(_._1.nonEmpty).
groupBy(_._1).
map(p => p._1 -> p._2.map(_._2).minBy(p => (-p.size, p.head.index))).
toSeq.
sortBy { case (data, combo) => (-combo.size, -data.size, combo.head.index, data.head.index) }
/**
* Produces all sliding windows over the given tokens, from the full sequence down to single elements.
*
* @param toks Tokens.
*/
private def combos[T](toks: Seq[T]): Seq[Seq[T]] =
(for (n <- toks.size until 0 by -1) yield toks.sliding(n)).flatten
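// Example: combos(Seq("A", "B", "C")) ==
// Seq(Seq("A", "B", "C"), Seq("A", "B"), Seq("B", "C"), Seq("A"), Seq("B"), Seq("C"))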
/**
* Converts matched IDL tokens into parts of the composite token.
*
* @param mdl Probe model.
* @param srvReqId Server request ID.
* @param seq Matched IDL tokens.
* @param s Matched synonym.
*/
private def toParts(mdl: NCProbeModel, srvReqId: String, seq: Seq[IdlToken], s: Synonym): Seq[TokType] =
seq.zip(s.map(_.kind)).map {
case (idlTok, kind) =>
val t = if (idlTok.isToken) idlTok.token else mkNlpToken(mdl, srvReqId, idlTok.word)
t -> kind
}
/**
* Converts a mix of IDL tokens and plain words into sentence tokens.
*
* @param tows Tokens or words.
* @param ns Sentence.
*/
private def toTokens(tows: Seq[IdlToken], ns: Sentence): Seq[NlpToken] =
(
tows.filter(_.isWord).map(_.word) ++
tows.filter(_.isToken).map(_.token).
flatMap(w => ns.filter(t => t.wordIndexes.intersect(w.wordIndexes).nonEmpty))
).sortBy(_.startCharIndex)
/**
* Gets synonyms for the given element ID, or an empty sequence if none.
*
* @param m Synonyms map.
* @param id Element ID.
*/
private def get(m: Map[String, Seq[Synonym]], id: String): Seq[Synonym] = m.getOrElse(id, Seq.empty)
/**
* Gets synonyms sorted in descending order by their weight (already prepared),
* i.e. first synonym in the sequence is the most important one.
*
* @param fastMap Map of {element ID -> {synonym length -> T}}.
* @param elmId Element ID.
* @param len Synonym length (in tokens).
*/
private def fastAccess[T](fastMap: Map[String, Map[Int, T]], elmId: String, len: Int): Option[T] =
fastMap.getOrElse(elmId, Map.empty[Int, T]).get(len)
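// Usage example (hypothetical element ID): fastAccess(mdl.continuousSynonyms, "myElem", 2)
// looks up the holder of 2-token continuous synonyms of element 'myElem', if any.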
/**
* Prepares the IDL tokens holder for the given sentence.
*
* @param mdl Probe model.
* @param ns Sentence.
*/
private def mkHolder(mdl: NCProbeModel, ns: Sentence): IdlTokensHolder = {
val toks = ns.map(IdlToken(_))
val seqs =
NCProbeVariants.convert(ns.srvReqId, mdl, NCSentenceManager.collapse(mdl.model, ns.clone())).
map(_.asScala).
par.
flatMap(sen =>
// Tokens splitting.
// For example, the sentence "A B C D E" (5 words) is processed as 3 tokens on the first phase after collapsing:
// 'A B' (2 words), 'C D' (2 words) and 'E' (1 word).
// So the resulting combinations will be:
// Token(AB) + Token(CD) + Token(E)
// Token(AB) + Word(C) + Word(D) + Token(E)
// Word(A) + Word(B) + Token(CD) + Token(E)
// Word(A) + Word(B) + Word(C) + Word(D) + Token(E)
combos(sen).
map(senPartComb => {
sen.flatMap(t =>
// Single-word tokens and tokens from the current combination are kept as tokens.
// All other tokens are split into their constituent words.
if (t.wordIndexes.length == 1 || senPartComb.contains(t))
Seq(IdlToken(t))
else
t.wordIndexes.map(toks)
)
// Drops sequences without any tokens (IDL matching requires tokens).
}).filter(_.exists(_.isToken)).map(IdlTokensSeq(_)).distinct
).seq
IdlTokensHolder(toks, seqs)
}
/**
* Wraps a single NLP token into a model token carrying only its NLP note.
*
* @param mdl Probe model.
* @param srvReqId Server request ID.
* @param t NLP token.
*/
private def mkNlpToken(mdl: NCProbeModel, srvReqId: String, t: NlpToken): NCToken = {
val notes = mutable.HashSet.empty[NlpNote]
notes += t.getNlpNote
NCTokenImpl(mdl, srvReqId, NlpToken(t.index, notes, t.stopsReasons))
}
/**
* Makes IDL token combinations for the given tokens slice.
*
* @param h IDL tokens holder.
* @param toks Tokens slice.
*/
private def mkCombinations(h: IdlTokensHolder, toks: Seq[NlpToken]): Seq[Seq[IdlToken]] = {
val idxs = toks.flatMap(_.wordIndexes).toSet
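// For each pre-computed variant sequence: keeps its tokens that overlap the given slice
// and fills the remaining overlapping word indexes with plain word tokens.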
h.seqs.par.
flatMap(seq => {
val rec = seq.tokens.filter(_.wordIndexes.exists(idxs.contains))
// Drops sequences without any tokens (IDL matching requires tokens).
if (rec.nonEmpty)
Some(rec ++
(seq.wordsIndexes.intersect(idxs) -- rec.flatMap(_.wordIndexes)).map(h.tokens)
)
else
None
}).seq
}
/**
* Adds stopwords lying strictly inside the matched range back to the matched tokens.
*
* @param matched Matched tokens.
* @param toks2Match Tokens the matching was performed on.
*/
private def getSparsedTokens(matched: Seq[NlpToken], toks2Match: Seq[NlpToken]): Seq[NlpToken] = {
require(matched.nonEmpty)
// Matched tokens should be already sorted.
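// Example: if tokens at indexes 2 and 5 matched and tokens 3 and 4 are stopwords,
// the result is the continuous token range 2, 3, 4, 5.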
val stopsInside = toks2Match.filter(t =>
t.isStopWord && !matched.contains(t) && t.index > matched.head.index && t.index < matched.last.index
)
if (stopsInside.nonEmpty) (matched ++ stopsInside).sortBy(_.index) else matched
}
@throws[NCE]
override def enrich(mdl: NCProbeModel, ns: Sentence, senMeta: Map[String, Serializable], parent: Span = null): Unit = {
require(isStarted)
startScopedSpan(
"enrich", parent, "srvReqId" -> ns.srvReqId, "mdlId" -> mdl.model.getId, "txt" -> ns.text
) { span =>
val req = NCRequestImpl(senMeta, ns.srvReqId)
lazy val ch = mkHolder(mdl, ns)
lazy val variantsToks =
ch.seqs.map(
p => p.tokens.map(p => if (p.isToken) p.token else mkNlpToken(mdl, ns.srvReqId, p.word))
)
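// Note: 'ch' and 'variantsToks' are computed lazily, only when IDL processing actually needs them.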
def execute(simpleEnabled: Boolean, idlEnabled: Boolean): Unit =
startScopedSpan(
"execute", span, "srvReqId" -> ns.srvReqId, "mdlId" -> mdl.model.getId, "txt" -> ns.text
) { _ =>
if (DEEP_DEBUG)
logger.trace(s"Execution started [simpleEnabled=$simpleEnabled, idlEnabled=$idlEnabled]")
for (
// 'toksExt' is a piece of the sentence; 'toks' is the same piece or one with some of its stopwords removed.
(toks, toksExt) <- combosTokens(ns.toSeq);
idxs = toks.map(_.index);
e <- mdl.elements.values;
elemId = e.getId;
greedy = e.isGreedy.orElse(mdl.model.isGreedy)
if !greedy || !alreadyMarked(ns, elemId, toks, idxs)
) {
def add(
dbgType: String,
elemToks: Seq[NlpToken],
syn: Synonym,
parts: Seq[TokType] = Seq.empty
): Unit = {
val resIdxs = elemToks.map(_.index)
val ok =
(!greedy || !alreadyMarked(ns, elemId, elemToks, idxs)) &&
(parts.isEmpty || !parts.exists { case (t, _) => t.getId == elemId })
if (ok)
mark(
ns,
elemId,
elemToks,
direct = syn.isDirect && U.isIncreased(resIdxs),
syn = Some(syn),
parts = parts
)
if (DEEP_DEBUG)
logger.trace(
s"${if (ok) "Added" else "Skipped"} element [" +
s"id=$elemId, " +
s"type=$dbgType, " +
s"text='${elemToks.map(_.origText).mkString(" ")}', " +
s"indexes=${resIdxs.mkString("[", ",", "]")}, " +
s"allTokensIndexes=${idxs.mkString("[", ",", "]")}, " +
s"synonym=$syn" +
s"]"
)
}
// 1. SIMPLE.
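// Note: when IDL is enabled this pass covers only elements that have IDL synonyms;
// otherwise it covers only elements without them.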
if (simpleEnabled && (if (idlEnabled) mdl.hasIdlSynonyms(elemId) else !mdl.hasIdlSynonyms(elemId))) {
lazy val tokStems = toks.map(_.stem).mkString(" ")
// 1.1 Continuous.
var found = false
if (mdl.hasContinuousSynonyms)
fastAccess(mdl.continuousSynonyms, elemId, toks.length) match {
case Some(h) =>
def tryMap(syns: Map[String, Synonym], notFound: () => Unit): Unit =
syns.get(tokStems) match {
case Some(s) =>
found = true
add("simple continuous", toksExt, s)
case None => notFound()
}
def tryScan(syns: Seq[Synonym]): Unit =
for (syn <- syns if !found)
NCSynonymsManager.onMatch(
ns.srvReqId,
elemId,
syn,
toks,
_ => {
found = true
add("simple continuous scan", toksExt, syn)
}
)
tryMap(
h.txtDirectSynonyms,
() => {
tryScan(h.notTxtDirectSynonyms)
if (!found)
tryMap(h.txtNotDirectSynonyms, () => tryScan(h.notTxtNotDirectSynonyms))
}
)
case None => // No-op.
}
// 1.2 Sparse.
if (!found && mdl.hasSparseSynonyms)
for (syn <- get(mdl.sparseSynonyms, elemId))
NCSynonymsManager.onSparseMatch(
ns.srvReqId,
elemId,
syn,
toks,
res => add("simple sparse", getSparsedTokens(res, toks), syn)
)
}
// 2. IDL.
if (idlEnabled) {
val allSyns = get(mdl.idlSynonyms, elemId)
lazy val allCombs = mkCombinations(ch, toks)
// 2.1 Continuous.
if (!mdl.hasSparseSynonyms) {
var found = false
for (syn <- allSyns; comb <- allCombs; if !found)
NCSynonymsManager.onMatch(
ns.srvReqId,
elemId,
syn,
comb,
req,
variantsToks,
_ => {
val parts = toParts(mdl, ns.srvReqId, comb, syn)
add("IDL continuous", toksExt, syn, parts)
found = true
}
)
}
else
// 2.2 Sparse.
for (syn <- allSyns; comb <- allCombs)
NCSynonymsManager.onSparseMatch(
ns.srvReqId,
elemId,
syn,
comb,
req,
variantsToks,
res => {
val toks = getSparsedTokens(toTokens(res, ns), toTokens(comb, ns))
val parts = toParts(mdl, ns.srvReqId, res, syn)
val typ = if (syn.sparse) "IDL sparse" else "IDL continuous"
add(typ, toks, syn, parts)
}
)
}
}
}
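// The first probe phase runs both simple and IDL processing as the model requires;
// subsequent phases re-run IDL processing only.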
if (ns.firstProbePhase) {
ns.firstProbePhase = false
if (mdl.hasNoIdlSynonyms)
execute(simpleEnabled = true, idlEnabled = false)
execute(simpleEnabled = mdl.hasNoIdlSynonyms, idlEnabled = mdl.hasIdlSynonyms)
}
else if (mdl.hasIdlSynonyms)
execute(simpleEnabled = false, idlEnabled = true)
processParsers(mdl, ns, span, req)
}
NCSynonymsManager.clearIteration(ns.srvReqId)
normalize(ns)
}
/**
* Removes user notes that duplicate another note of the same type and parts while covering
* the same words plus extra stopwords only.
*
* @param ns Sentence.
*/
private def normalize(ns: Sentence): Unit = {
val usrNotes = ns.flatten.filter(_.isUser).distinct
val links = NCSentenceManager.getLinks(usrNotes)
val parts = NCSentenceManager.getPartKeys(usrNotes)
val usrNotesIdxs = usrNotes.
filter(n => !links.contains(NoteLink(n.noteType, n.tokenIndexes.sorted))).
filter(n => !parts.contains(NCTokenPartKey(n, ns))).
zipWithIndex
usrNotesIdxs.
foreach { case (n, idx) =>
usrNotesIdxs.find { case (candidate, candidateIdx) =>
candidateIdx != idx &&
candidate.noteType == n.noteType &&
candidate.dataOpt("parts") == n.dataOpt("parts") &&
candidate.wordIndexesSet.subsetOf(n.wordIndexesSet) &&
n.wordIndexes.filter(wordIdx => !candidate.wordIndexes.contains(wordIdx)).
forall(wordIdx => ns.tokens.exists(t => t.wordIndexes.contains(wordIdx) && t.isStopWord))
} match {
case Some(better) =>
ns.removeNote(n)
logger.trace(s"Element removed: $n, better: $better")
case None => // No-op.
}
}
}
// TODO: simplify, add tests, check model properties (sparse etc.) for optimization.
/**
* Checks whether the given element is already marked on the given tokens.
*
* @param ns Sentence.
* @param elmId Element ID.
* @param toks Tokens to check.
* @param sliceToksIdxsSorted Sorted token indexes of the whole checked slice.
*/
private def alreadyMarked(ns: Sentence, elmId: String, toks: Seq[NlpToken], sliceToksIdxsSorted: Seq[Int]): Boolean = {
lazy val toksIdxsSorted = toks.map(_.index).sorted
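// The element is considered already marked if either (a) every token of the given slice
// already carries a continuous (sparsity == 0) note of this element, or (b) some token
// carries a note of this element whose indexes equal or cover the checked indexes.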
sliceToksIdxsSorted.map(ns).forall(_.exists(n => n.noteType == elmId && n.sparsity == 0)) ||
toks.exists(_.exists(n =>
n.noteType == elmId &&
(
(n.sparsity == 0 &&
(sliceToksIdxsSorted.containsSlice(n.tokenIndexes) || n.tokenIndexes.containsSlice(toksIdxsSorted))
)
||
(
n.tokenIndexes == toksIdxsSorted ||
n.tokenIndexes.containsSlice(toksIdxsSorted) &&
U.isContinuous(toksIdxsSorted) &&
U.isContinuous(n.tokenIndexes)
)
)
))
}
}