/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model
import java.io.Serializable
import java.util
import io.opencensus.trace.Span
import org.apache.nlpcraft.common._
import org.apache.nlpcraft.common.nlp.{NCNlpSentenceToken, _}
import org.apache.nlpcraft.model._
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.impl.NCRequestImpl
import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeSynonym, NCProbeVariants}
import scala.collection.JavaConverters._
import scala.collection.convert.DecorateAsScala
import scala.collection.mutable.ArrayBuffer
import scala.collection.{Map, Seq, mutable}
/**
* Model elements enricher.
*/
object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
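/**
 * A single unit of matching: either an already detected model token (`Left`) or a raw NLP word (`Right`).
 */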
case class Complex(data: Either[NCToken, NCNlpSentenceToken]) {
lazy val isToken: Boolean = data.isLeft
lazy val isWord: Boolean = data.isRight
lazy val token: NCToken = data.left.get
lazy val word: NCNlpSentenceToken = data.right.get
lazy val origText: String = if (isToken) token.origText else word.origText
lazy val wordIndexes: Seq[Int] = if (isToken) token.wordIndexes else word.wordIndexes
private lazy val hash = if (isToken) token.hashCode() else word.hashCode()
override def hashCode(): Int = hash
override def equals(obj: Any): Boolean = obj match {
case x: Complex ⇒ isToken && x.isToken && token == x.token || isWord && x.isWord && word == x.word
case _ ⇒ false
}
// Added for debugging.
override def toString: String =
if (isToken) s"Token: '${token.origText} (${token.getId})'" else s"Word: '${word.origText}'"
}
// Found-by-synonym model element.
case class ElementMatch(
element: NCElement,
tokens: Seq[NCNlpSentenceToken],
synonym: NCProbeSynonym,
parts: Seq[NCToken]
) extends Ordered[ElementMatch] {
// Tokens sparsity.
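// Sparsity counts the gaps between matched tokens, e.g. for token indexes [1, 3, 6]
// it is (2 + 3) - 3 + 1 = 3, while any contiguous span yields 0.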
lazy val sparsity: Int = tokens.zipWithIndex.tail.map {
case (tok, idx) ⇒ Math.abs(tok.index - tokens(idx - 1).index)
}.sum - tokens.length + 1
// Number of tokens.
lazy val length: Int = tokens.size
private lazy val tokensSet = tokens.toSet
def isSubSet(toks: Set[NCNlpSentenceToken]): Boolean = toks.subsetOf(tokensSet)
override def compare(that: ElementMatch): Int = {
// Check synonym first, then length and then sparsity.
// Note that less sparsity means more certainty in a match.
if (that == null)
1
else if (synonym < that.synonym)
-1
else if (synonym > that.synonym)
1
else if (length < that.length)
-1
else if (length > that.length)
1
else if (sparsity < that.sparsity)
1
else if (sparsity > that.sparsity)
-1
else
0
}
}
override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ ⇒
super.start()
}
override def stop(parent: Span = null): Unit = startScopedSpan("stop", parent) { _ ⇒
super.stop()
}
/**
* Returns an iterator of token buffers where tokens are jiggled left and right by the given factor.
* Note that only one token is jiggled at a time.
*
* @param ns NLP sentence to jiggle.
* @param factor Distance of left or right jiggle, i.e. how far can an individual token move
* left or right in the sentence.
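*
* For example, for tokens "a b c" and factor 1 the iterator produces:
* "a b c", "b a c", "b a c", "a b c", "a c b", "a c b", "a b c"
* (each token is shifted by every in-bounds offset in [-1, 1]; duplicates are not filtered out).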
*/
private def jiggle(ns: NCNlpSentenceTokenBuffer, factor: Int): Iterator[NCNlpSentenceTokenBuffer] = {
require(factor >= 0)
if (ns.isEmpty)
Iterator.empty
else if (factor == 0)
Iterator.apply(ns)
else
new Iterator[NCNlpSentenceTokenBuffer] {
private val min = -factor
private val max = factor
private val sz = ns.size
private var i = 0 // Token index.
private var d = 0 // Jiggle amount [min, max].
private var isNext = sz > 0
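// Advances (i, d) to the next valid (token index, offset) pair, scanning the remaining
// offsets in [min, max] for the current token before moving on to the next token.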
private def calcNext(): Unit = {
isNext = false
d += 1
while (i < sz && !isNext) {
while (d <= max && !isNext) {
val p = i + d
if (p >= 0 && p < sz) // Valid new position?
isNext = true
else
d += 1
}
if (!isNext) {
d = min
i += 1
}
}
}
override def hasNext: Boolean = isNext
override def next(): NCNlpSentenceTokenBuffer = {
require(isNext)
val buf = NCNlpSentenceTokenBuffer(ns)
if (d != 0)
buf.insert(i + d, buf.remove(i)) // Jiggle.
calcNext()
buf
}
}
}
/**
 * Marks given tokens with a note for the detected model element.
 *
 * @param ns Parent NLP sentence.
 * @param elem Detected model element.
 * @param toks Tokens to mark.
 * @param direct Whether the match is direct (tokens in original order).
 * @param syn Matched synonym, if any.
 * @param metaOpt Optional metadata to attach to the note.
 * @param parts Constituent tokens for composite (DSL) matches.
 */
private def mark(
ns: NCNlpSentence,
elem: NCElement,
toks: Seq[NCNlpSentenceToken],
direct: Boolean,
syn: Option[NCProbeSynonym],
metaOpt: Option[Map[String, Object]],
parts: Seq[NCToken]
): Unit = {
val params = mutable.ArrayBuffer.empty[(String, AnyRef)]
// For system elements.
params += "direct" → direct.asInstanceOf[AnyRef]
syn match {
case Some(s) ⇒
if (s.isValueSynonym)
params += "value" → s.value
case None ⇒ // No-op.
}
metaOpt match {
case Some(meta) ⇒ params += "meta" → meta
case None ⇒ // No-op.
}
if (parts.nonEmpty) {
val partsData: Seq[util.HashMap[String, Any]] =
parts.map(part ⇒ {
val m = new util.HashMap[String, Any]()
m.put("id", part.getId)
m.put("startcharindex", part.getStartCharIndex)
m.put("endcharindex", part.getEndCharIndex)
m.put(TOK_META_ALIASES_KEY, part.getMetadata.get(TOK_META_ALIASES_KEY))
m
})
params += "parts" → partsData.asJava
}
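// Create a single note spanning all matched tokens and attach it to each of them.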
val idxs = toks.map(_.index).sorted
val note = NCNlpSentenceNote(idxs, elem.getId, params: _*)
toks.foreach(_.add(note))
// For NLP elements.
toks.foreach(t ⇒ ns.fixNote(t.getNlpNote, "direct" → direct))
}
/**
* Gets all sequential permutations of given tokens.
*
* For example, if buffer contains "a b c d" tokens, then this function will return the
* sequence of following token sequences in this order:
* "a b c d"
* "a b c"
* "b c d"
* "a b"
* "b c"
* "c d"
* "a"
* "b"
* "c"
* "d"
*
* @param toks Tokens to produce combinations from.
* @return All sequential sub-sequences, longest first.
*/
protected def combos[T](toks: Seq[T]): Seq[Seq[T]] =
(for (n ← toks.size until 0 by -1) yield toks.sliding(n)).flatten
/**
 * Converts a token span into sequences of `Complex` elements (mixes of detected
 * tokens and raw words) suitable for DSL synonym matching.
 *
 * @param initialSen Initial NLP sentence.
 * @param collapsedSen Collapsed sentence variants (sequences of detected tokens).
 * @param nlpToks NLP tokens defining the span to convert.
 */
private def convert(
initialSen: NCNlpSentence, collapsedSen: Seq[Seq[NCToken]], nlpToks: Seq[NCNlpSentenceToken]
): Seq[Seq[Complex]] = {
val nlpWordIdxs = nlpToks.flatMap(_.wordIndexes)
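// 'in' - the token overlaps the given NLP span, 'inStrict' - the token lies entirely within it.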
def in(t: NCToken): Boolean = t.wordIndexes.exists(nlpWordIdxs.contains)
def inStrict(t: NCToken): Boolean = t.wordIndexes.forall(nlpWordIdxs.contains)
def isSingleWord(t: NCToken): Boolean = t.wordIndexes.length == 1
collapsedSen.
map(_.filter(in)).
filter(_.nonEmpty).flatMap(varToks ⇒
// Tokens splitting.
// For example, the sentence "A B C D E" (5 words) can be collapsed on the first phase
// into 3 tokens: 'A B' (2 words), 'C D' (2 words) and 'E' (1 word).
// The resulting combinations will then be:
// Token(AB) + Token(CD) + Token(E)
// Token(AB) + Word(C) + Word(D) + Token(E)
// Word(A) + Word(B) + Token(CD) + Token(E)
// Word(A) + Word(B) + Word(C) + Word(D) + Token(E)
combos(varToks).map(toksComb ⇒
varToks.flatMap(t ⇒
// A single-word token is never split into words.
// A token only partly covered by the span (not strictly inside it) is split into words.
if ((toksComb.contains(t) || isSingleWord(t)) && inStrict(t))
Seq(Complex(Left(t)))
else
t.wordIndexes.filter(nlpWordIdxs.contains).map(i ⇒ Complex(Right(initialSen(i))))
)
).filter(_.exists(_.isToken)) // Drop combinations without tokens (DSL matching requires at least one token).
).distinct
}
/**
 * Checks whether all given tokens are already marked with the given element ID.
 *
 * @param toks Tokens to check.
 * @param elemId Element ID.
 */
private def alreadyMarked(toks: Seq[NCNlpSentenceToken], elemId: String): Boolean = toks.forall(_.isTypeOf(elemId))
def isComplex(mdl: NCProbeModel): Boolean = mdl.synonymsDsl.nonEmpty || !mdl.model.getParsers.isEmpty
@throws[NCE]
override def enrich(mdl: NCProbeModel, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span = null): Unit =
startScopedSpan("enrich", parent,
"srvReqId" → ns.srvReqId,
"mdlId" → mdl.model.getId,
"txt" → ns.text) { span ⇒
val jiggleFactor = mdl.model.getJiggleFactor
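// Token index combinations processed so far and accumulated element matches,
// shared across all jiggled permutations.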
val cache = mutable.HashSet.empty[Seq[Int]]
val matches = ArrayBuffer.empty[ElementMatch]
/**
 * Gets the sequence of synonyms sorted in descending order by their weight, i.e. the first
 * synonym in the sequence is the most important one.
 *
 * @param fastMap Element ID -> synonym length -> synonyms lookup map.
 * @param elmId Element ID.
 * @param len Synonym length in tokens.
 * @return Matching synonyms or an empty sequence.
 */
def fastAccess(
fastMap: Map[String /*Element ID*/ , Map[Int /*Synonym length*/ , Seq[NCProbeSynonym]]],
elmId: String,
len: Int): Seq[NCProbeSynonym] =
fastMap.get(elmId).flatMap(_.get(len)) match {
case Some(seq) ⇒ seq
case None ⇒ Seq.empty[NCProbeSynonym]
}
/**
 * Renders tokens as a debug string of (text, index) pairs.
 *
 * @param toks Tokens to render.
 * @return Debug string.
 */
def tokString(toks: Seq[NCNlpSentenceToken]): String =
toks.map(t ⇒ (t.origText, t.index)).mkString(" ")
var permCnt = 0
var collapsedSens: Seq[Seq[NCToken]] = null
/**
*
* @param perm Permutation to process.
*/
def procPerm(perm: NCNlpSentenceTokenBuffer): Unit = {
permCnt += 1
for (toks ← combos(perm)) {
val key = toks.map(_.index).sorted
if (!cache.contains(key)) {
var seq: Seq[Seq[Complex]] = null
// Attempt to match each element.
for (elm ← mdl.elements.values if !alreadyMarked(toks, elm.getId)) {
var found = false
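// Records a match unless an existing match for the same element already covers all these tokens.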
def addMatch(
elm: NCElement, toks: Seq[NCNlpSentenceToken], syn: NCProbeSynonym, parts: Seq[NCToken]
): Unit =
if (!matches.exists(m ⇒ m.element == elm && m.isSubSet(toks.toSet))) {
found = true
matches += ElementMatch(elm, toks, syn, parts)
}
// Optimization: plain (non-DSL) synonyms can only match while the sentence has no user tokens yet, i.e. on the first iteration.
if (mdl.synonyms.nonEmpty && !ns.exists(_.isUser))
for (syn ← fastAccess(mdl.synonyms, elm.getId, toks.length) if !found)
if (syn.isMatch(toks))
addMatch(elm, toks, syn, Seq.empty)
if (mdl.synonymsDsl.nonEmpty) {
found = false
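// Collapsed sentence variants and their 'Complex' combinations are built lazily,
// at most once per sentence/span, since they are needed only for DSL synonyms.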
if (collapsedSens == null)
collapsedSens = NCProbeVariants.convert(ns.srvReqId, mdl, ns.clone().collapse()).map(_.asScala)
if (seq == null)
seq = convert(ns, collapsedSens, toks)
for (comb ← seq; syn ← fastAccess(mdl.synonymsDsl, elm.getId, comb.length) if !found)
if (syn.isMatch(comb.map(_.data)))
addMatch(elm, toks, syn, comb.filter(_.isToken).map(_.token))
}
}
cache += key
}
}
}
startScopedSpan("jiggleProc", span,
"srvReqId" → ns.srvReqId,
"mdlId" → mdl.model.getId,
"txt" → ns.text) { _ ⇒
// Iterate over depth-limited permutations of the original sentence with and without stopwords.
jiggle(ns, jiggleFactor).foreach(procPerm)
jiggle(NCNlpSentenceTokenBuffer(ns.filter(!_.isStopWord)), jiggleFactor).foreach(procPerm)
}
if (DEEP_DEBUG)
logger.trace(s"Total jiggled permutations processed: $permCnt")
addTags(
span,
"totalJiggledPerms" → permCnt
)
val matchCnt = matches.size
// Add notes for all remaining (non-intersecting) matches.
for ((m, idx) ← matches.zipWithIndex) {
if (DEEP_DEBUG)
logger.trace(
s"Model '${mdl.model.getId}' element found (${idx + 1} of $matchCnt) [" +
s"elementId=${m.element.getId}, " +
s"synonym=${m.synonym}, " +
s"tokens=${tokString(m.tokens)}" +
s"]"
)
val elm = m.element
val syn = m.synonym
val tokIdxs = m.tokens.map(_.index)
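// A match is direct only if its synonym is direct and the tokens are in their original order.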
val direct = syn.isDirect && (tokIdxs == tokIdxs.sorted)
mark(ns, elem = elm, toks = m.tokens, direct = direct, syn = Some(syn), metaOpt = None, parts = m.parts)
}
val parsers = mdl.model.getParsers
for (parser ← parsers.asScala) {
parser.onInit()
startScopedSpan("customParser", span,
"srvReqId" → ns.srvReqId,
"mdlId" → mdl.model.getId,
"txt" → ns.text) { _ ⇒
def to(t: NCNlpSentenceToken): NCCustomWord =
new NCCustomWord {
override def getNormalizedText: String = t.normText
override def getOriginalText: String = t.origText
override def getStartCharIndex: Int = t.startCharIndex
override def getEndCharIndex: Int = t.endCharIndex
override def getPos: String = t.pos
override def getPosDescription: String = t.posDesc
override def getLemma: String = t.lemma
override def getStem: String = t.stem
override def isStopWord: Boolean = t.isStopWord
override def isBracketed: Boolean = t.isBracketed
override def isQuoted: Boolean = t.isQuoted
override def isKnownWord: Boolean = t.isKnownWord
override def isSwearWord: Boolean = t.isSwearWord
override def isEnglish: Boolean = t.isEnglish
}
val res = parser.parse(
NCRequestImpl(senMeta, ns.srvReqId),
mdl.model,
ns.map(to).asJava,
ns.flatten.distinct.filter(!_.isNlp).map(n ⇒ {
val noteId = n.noteType
val words = ns.filter(t ⇒ t.index >= n.tokenFrom && t.index <= n.tokenTo).map(to).asJava
val md = n.asMetadata()
new NCCustomElement() {
override def getElementId: String = noteId
override def getWords: util.List[NCCustomWord] = words
override def getMetadata: util.Map[String, AnyRef] =
md.map(p ⇒ p._1 → p._2.asInstanceOf[AnyRef]).asJava
}
}).asJava
)
if (res != null)
res.asScala.foreach(e ⇒ {
val elemId = e.getElementId
val words = e.getWords
if (elemId == null)
throw new NCE(s"Custom model parser cannot return 'null' element ID.")
if (words == null || words.isEmpty)
throw new NCE(s"Custom model parser cannot return empty custom tokens [elementId=$elemId]")
val matchedToks = words.asScala.map(w ⇒
ns.find(t ⇒
t.startCharIndex == w.getStartCharIndex && t.endCharIndex == w.getEndCharIndex
).getOrElse(throw new AssertionError(s"Custom model parser returned an invalid custom token: $w"))
)
if (!alreadyMarked(matchedToks, elemId))
mark(
ns,
elem = mdl.elements.getOrElse(elemId, throw new NCE(s"Custom model parser returned unknown element ID: $elemId")),
toks = matchedToks,
direct = true,
syn = None,
metaOpt = Some(e.getMetadata.asScala),
parts = Seq.empty
)
})
}
parser.onDiscard()
}
}
}