// blob: d7560b91f14ae5001e1cc0f7cb45927fbbd703ac [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.limit
import java.io.Serializable
import io.opencensus.trace.Span
import org.apache.nlpcraft.common.makro.NCMacroParser
import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager
import org.apache.nlpcraft.common.nlp.numeric.{NCNumeric, NCNumericManager}
import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSentenceToken}
import org.apache.nlpcraft.common.{NCE, NCService}
import org.apache.nlpcraft.probe.mgrs.NCProbeModel
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
import scala.collection.JavaConverters._
import scala.collection.{Map, Seq, mutable}
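/**
 * Limit enricher.
 *
 * Detects limit expressions (synonym templates built from top/post words and numbers,
 * or plain numbers) that are followed by a user element reference and attaches
 * `nlpcraft:limit` notes to the tokens of the limit expression.
 */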
object NCLimitEnricher extends NCProbeEnricher {
/**
 * Result of a successful limit detection, produced by `tryToMatch` and converted
 * into `nlpcraft:limit` notes by `enrich`.
 *
 * @param limit Limit value: either the limit of the matched template (`GroupsHolder.limit`)
 *      or the value of the matched plain number.
 * @param asc Sort direction flag. `tryToMatch` sets it to `Some(...)` for template and fuzzy number
 *      matches and to `None` for plain number matches.
 * @param matched Tokens of the limit expression itself (candidate tokens without the reference tokens).
 *      The created note is added to each of them.
 * @param refNotes Note types common to the reference tokens. One limit note is created per type.
 * @param refIndexes Indexes of the reference tokens (Java list, stored as the note's `indexes` parameter).
 */
case class Match(
limit: Double,
asc: Option[Boolean],
matched: Seq[NCNlpSentenceToken],
refNotes: Set[String],
refIndexes: java.util.List[Int]
)
// Identifier passed to `NCNlpSentenceNote` for every note created by this enricher.
private final val TOK_ID = "nlpcraft:limit"
// Placeholder that designates a number inside the synonym templates. It stands for:
// - digits (like `25`),
// - word numbers (like `twenty two`) or
// - fuzzy numbers (like `few`).
private final val CD = "[CD]"
// Synonym templates (expanded with the macro parser in `start()`). Possible elements:
// - any macro,
// - the special symbol `CD` (which designates an obvious number or a fuzzy number word),
// - any simple word.
// Note that `CD` is optional (`DFLT_LIMIT` will be used when it is absent).
private final val SYNONYMS = Seq(
s"<TOP_WORDS> {of|*} {$CD|*} {<POST_WORDS>|*}",
s"$CD of",
s"$CD <POST_WORDS>",
s"<POST_WORDS> $CD"
)
// Limit used when a matched template contains no number (see `GroupsHolder.limit`).
private final val DFLT_LIMIT = 10
/**
 * Group of neighbouring tokens. Either all of them are numbers or none of them is.
 *
 * @param tokens Tokens of the group.
 * @param number Numeric value of the tokens. Optional.
 * @param isFuzzyNum Fuzzy value flag.
 */
case class Group(tokens: Seq[NCNlpSentenceToken], number: Option[Int], isFuzzyNum: Boolean) {
    // Template element for this group: the `CD` placeholder for numeric groups,
    // space-separated token stems otherwise.
    lazy val value: String =
        if (number.isDefined) CD else tokens.map(_.stem).mkString(" ")

    // Sentence index of the first token of the group.
    lazy val index: Int = tokens.head.index
}
/**
 * Sequence of neighbouring groups treated as one limit expression candidate.
 *
 * @param groups Groups.
 */
case class GroupsHolder(groups: Seq[Group]) {
    // All tokens of all groups, in group order.
    lazy val tokens: Seq[NCNlpSentenceToken] = groups.flatMap(_.tokens)

    // Limit value: default when there is no numeric group, the number itself when there is
    // exactly one. More than one numeric group is not expected in a template.
    lazy val limit: Int = {
        val nums = groups.flatMap(_.number)

        if (nums.isEmpty)
            DFLT_LIMIT
        else if (nums.size == 1)
            nums.head
        else
            throw new AssertionError(s"Unexpected numeric count in template: ${nums.size}")
    }

    // Ascending flag: taken from the sort word when exactly one is present, `false` otherwise.
    lazy val asc: Boolean = {
        val flags: Seq[Boolean] = tokens.map(_.stem).flatMap(sortWords.get)

        flags.size == 1 && flags.head
    }

    // Template string of the holder: group values joined by a single space.
    lazy val value: String = groups.map(_.value).mkString(" ")

    // True only for a single group that is a fuzzy number.
    lazy val isFuzzyNum: Boolean = groups.size == 1 && groups.head.isFuzzyNum
}
// The dictionaries below are populated by `start()` and reset to `null` by `stop()`.
// Stems of fuzzy number words mapped to the numeric value they stand for.
@volatile private var fuzzyNums: Map[String, Int] = _
// Stems of sort words mapped to the ascending flag they imply.
@volatile private var sortWords: Map[String, Boolean] = _
// Stems of the words available through the `TOP_WORDS` macro.
@volatile private var topWords: Seq[String] = _
// Stems of the words available through the `POST_WORDS` macro.
@volatile private var postWords: Seq[String] = _
// Macro name mapped to its alternatives; used to expand `SYNONYMS`.
@volatile private var macros: Map[String, Iterable[String]] = _
// Distinct expansions of `SYNONYMS`; `GroupsHolder.value` is checked against them.
@volatile private var limits: Seq[String] = _
// Union of sort, top, post and fuzzy number words; used by `validImportant`.
@volatile private var techWords: Set[String] = _
/**
 * Replaces every key of the given map with its stem, keeping the values unchanged.
 *
 * @param m Map keyed by words.
 * @return Map keyed by the stems of the original keys.
 */
private def stemmatizeWords[T](m: Map[String, T]): Map[String, T] =
    m.map { case (word, value) ⇒ NCNlpCoreManager.stem(word) → value }
/**
 * Checks the first note of the token for which `isUser` is true.
 *
 * @param t Token to check.
 * @return `true` if such a note exists and it does not contain the `value` key,
 *      `false` if the token has no such note or the note contains `value`.
 */
private def isUserNotValue(t: NCNlpSentenceToken): Boolean =
    t.find(_.isUser).exists(n ⇒ !n.contains("value"))

/**
 * Checks whether the note is a user note that does not contain the `value` key.
 *
 * @param n Note to check.
 * @return `true` if `isUser` holds for the note and it has no `value` key.
 */
private def isUserNotValue(n: NCNlpSentenceNote): Boolean = n.isUser && !n.contains("value")
/**
 * Starts this component.
 *
 * Builds the stemmed dictionaries (fuzzy numbers, sort/top/post words), the macros derived
 * from them, the expanded list of limit synonyms and the set of technical words, then
 * delegates to the parent implementation.
 *
 * @param parent Optional parent span.
 */
override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ ⇒
    // Note that single words only supported now in code.
    fuzzyNums = stemmatizeWords(Map(
        "few" → 3,
        "several" → 3,
        "handful" → 5,
        "single" → 1,
        "some" → 3,
        "couple" → 2
    ))

    // Note that single words only supported now in code.
    // Value is the ascending flag implied by the word.
    sortWords = stemmatizeWords(Map(
        "top" → false,
        "most" → false,
        "first" → false,
        "bottom" → true,
        "last" → true
    ))

    topWords = Seq(
        "top",
        "most",
        "bottom",
        "first",
        "last"
    ).map(NCNlpCoreManager.stem)

    postWords = Seq(
        "total",
        "all together",
        "overall"
    ).map(NCNlpCoreManager.stem)

    // Macros: SORT_WORDS, TOP_WORDS, POST_WORDS
    macros = Map(
        "SORT_WORDS" → sortWords.keys,
        "TOP_WORDS" → topWords,
        "POST_WORDS" → postWords
    )

    limits = {
        // Several numbers cannot be in one template.
        require(SYNONYMS.forall(_.split(" ").map(_.trim).count(_ == CD) < 2))

        def toMacros(seq: Iterable[String]): String = seq.mkString("|")

        // Every macro `<NAME>` is registered as an alternatives group `{a|b|...}`.
        val parser = NCMacroParser(macros.map { case (name, seq) ⇒ s"<$name>" → s"{${toMacros(seq)}}" })

        // Duplicated elements is not a problem.
        SYNONYMS.flatMap(parser.expand).distinct
    }

    techWords = (sortWords.keys ++ topWords ++ postWords ++ fuzzyNums.keySet).toSet

    super.start()
}
/**
 * Stops this component: delegates to the parent implementation first and then
 * resets every dictionary built by `start()` to `null`.
 *
 * @param parent Optional parent span.
 */
override def stop(parent: Span = null): Unit = startScopedSpan("stop", parent) { _ ⇒
super.stop()
// Drop the dictionaries built by `start()`.
fuzzyNums = null
sortWords = null
topWords = null
postWords = null
macros = null
limits = null
techWords = null
}
/**
 * Checks that no important token was dropped from the given combination as a stopword.
 *
 * The combination is compared with the continuous sentence slice between its first and last
 * token. It is valid when nothing was dropped at all, or when both contain the same number of
 * important tokens (user non-value elements or technical words).
 *
 * @param ns Sentence.
 * @param toks Tokens in which some stopwords can be deleted. Must be sorted by index.
 * @return `true` if the combination can be processed.
 */
private def validImportant(ns: NCNlpSentence, toks: Seq[NCNlpSentenceToken]): Boolean = {
    def important(tok: NCNlpSentenceToken): Boolean = isUserNotValue(tok) || techWords.contains(tok.stem)

    val positions = toks.map(_.index)

    require(positions == positions.sorted)

    // Full range of sentence tokens covered by the combination.
    val span = ns.slice(positions.head, positions.last + 1)

    if (toks.length == span.length)
        true
    else
        toks.count(important) == span.count(important)
}
/**
 * Enriches the sentence with `nlpcraft:limit` notes.
 *
 * Every valid token combination of the sentence is tried as a limit expression. For each
 * match one note per referenced note type is created, unless an equal or similar note was
 * already added during this call, and the note is attached to all matched tokens.
 *
 * @param mdl Probe model (used here only for the span's `mdlId` tag).
 * @param ns Sentence to enrich.
 * @param senMeta Sentence metadata (not used by this enricher).
 * @param parent Optional parent span.
 */
@throws[NCE]
override def enrich(mdl: NCProbeModel, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span = null): Unit =
    startScopedSpan("enrich", parent,
        "srvReqId" → ns.srvReqId,
        "mdlId" → mdl.model.getId,
        "txt" → ns.text) { _ ⇒
        val notes = mutable.HashSet.empty[NCNlpSentenceNote]

        // Only numerics without units are considered.
        val numsMap = NCNumericManager.find(ns).filter(_.unit.isEmpty).map(p ⇒ p.tokens → p).toMap
        val groupsMap = groupNums(ns, numsMap.values)

        // Tries to grab tokens reverse way.
        // Example: A, B, C ⇒ ABC, BC, AB .. (BC will be processed first)
        for (toks ← ns.tokenMixWithStopWords().sortBy(p ⇒ (-p.size, -p.head.index)) if validImportant(ns, toks))
            tryToMatch(numsMap, groupsMap, toks) match {
                case Some(m) ⇒
                    for (refNote ← m.refNotes) {
                        val params = mutable.ArrayBuffer.empty[(String, Any)]

                        params += "limit" → m.limit
                        params += "indexes" → m.refIndexes
                        params += "note" → refNote

                        // Sort direction is optional.
                        m.asc.foreach(asc ⇒ params += "asc" → asc)

                        val note = NCNlpSentenceNote(m.matched.map(_.index), TOK_ID, params: _*)

                        // Skip notes equal or similar to ones already added.
                        if (!notes.exists(n ⇒ ns.notesEqualOrSimilar(n, note))) {
                            notes += note

                            m.matched.foreach(_.add(note))
                        }
                    }
                case None ⇒ // No-op.
            }
    }
/**
 * Finds the types of non-NLP notes that span exactly the given tokens, i.e. notes of the first
 * token that start at the first token and end at the last one. If nothing is found for all
 * tokens, the lookup is repeated without stopwords.
 *
 * @param toks Tokens. Can be empty.
 * @return Set of note types, empty if the tokens share no such note.
 */
private def getCommonNotes(toks: Seq[NCNlpSentenceToken]): Set[String] = {
    def spanning(sorted: Seq[NCNlpSentenceToken]): Set[String] = {
        require(sorted.nonEmpty)

        val first = sorted.head
        val last = sorted.last

        first.
            filter(n ⇒ !n.isNlp && first.index == n.tokenFrom && last.index == n.tokenTo).
            map(_.noteType).
            toSet
    }

    if (toks.isEmpty)
        Set.empty
    else {
        val ordered = toks.sortBy(_.index)
        val direct = spanning(ordered)

        if (direct.nonEmpty)
            direct
        else {
            // Second attempt: ignore stopwords.
            val noStops = ordered.filter(!_.isStopWord)

            if (noStops.nonEmpty) spanning(noStops) else Set.empty
        }
    }
}
/**
 * Tries to interpret the given tokens as a limit expression followed by a reference.
 *
 * The tokens match when:
 *  - they contain reference candidates (tokens with a user non-value note that lies fully
 *    inside the tokens' index range) and the last token is one of them;
 *  - the reference candidates share at least one common note type;
 *  - the remaining tokens, with or without stopwords, form either a known limit template,
 *    a single fuzzy number, or a plain number.
 *
 * @param numsMap Numerics keyed by their tokens.
 * @param groupsMap Group holders keyed by their tokens.
 * @param toks Candidate tokens. Must not be empty.
 * @return Match data, or `None` if the tokens are not a limit expression.
 */
private def tryToMatch(
    numsMap: Map[Seq[NCNlpSentenceToken], NCNumeric],
    groupsMap: Map[Seq[NCNlpSentenceToken], GroupsHolder],
    toks: Seq[NCNlpSentenceToken]
): Option[Match] = {
    val i1 = toks.head.index
    val i2 = toks.last.index

    val refCands = toks.filter(_.exists(n ⇒ isUserNotValue(n) && n.tokenIndexes.head >= i1 && n.tokenIndexes.last <= i2))

    // Reference should be last.
    if (refCands.nonEmpty && refCands.last.index == toks.last.index) {
        val commonRefNotes = getCommonNotes(refCands)

        if (commonRefNotes.nonEmpty) {
            val matchCands = toks.diff(refCands)
            val idxs = refCands.map(_.index)

            def try0(group: Seq[NCNlpSentenceToken]): Option[Match] =
                groupsMap.get(group).flatMap { h ⇒
                    if (limits.contains(h.value) || h.isFuzzyNum)
                        Some(Match(h.limit, Some(h.asc), matchCands, commonRefNotes, idxs.asJava))
                    else
                        // Not a template: a plain number is still accepted, without sort direction.
                        numsMap.get(group).map(num ⇒ Match(num.value, None, matchCands, commonRefNotes, idxs.asJava))
                }

            // Second attempt (evaluated only if the first one fails) ignores stopwords.
            try0(matchCands).orElse(try0(matchCands.filter(!_.isStopWord)))
        }
        else
            None
    }
    else
        None
}
/**
 * Splits the non-stopword tokens of the sentence into groups and builds holders for all
 * continuous sequences of these groups.
 *
 * @param ns Sentence.
 * @param nums Numerics detected in the sentence.
 * @return Holders keyed by the tokens they consist of.
 */
private def groupNums(ns: NCNlpSentence, nums: Iterable[NCNumeric]): Map[Seq[NCNlpSentenceToken], GroupsHolder] = {
    val numsMap = nums.map(n ⇒ n.tokens → n).toMap

    // All groups combinations.
    // Each non-stopword token is paired with its fuzzy number value, if any.
    val tks2Nums: Seq[(NCNlpSentenceToken, Option[Int])] = ns.filter(!_.isStopWord).map(t ⇒ t → fuzzyNums.get(t.stem))

    // Tokens: A; B; 20; C; twenty; two, D
    // NERs : -; -; 20; -; 22; 22; -
    // Groups: (A) → -; (B) → -; (20) → 20; (C) → -; (twenty, two) → 22; (D) → -;
    // NOTE(review): only fuzzy number values from `tks2Nums` drive the grouping below, so
    // multi-token word numbers may not be merged as the example suggests - confirm.
    val groups: Seq[Group] = tks2Nums.zipWithIndex.groupBy { case ((_, numOpt), idx) ⇒
        // Groups by artificial flag.
        // Flag is first index of independent token.
        // Tokens: A; B; 20; C; twenty; two, D
        // Indexes 0; 1; 2; 3; 4; 4; 6
        if (idx == 0)
            0
        else {
            // Finds last another.
            var i = idx

            while (i > 0 && numOpt.isDefined && tks2Nums(i - 1)._2 == numOpt)
                i = i - 1

            i
        }
    }.
        // Converts from artificial group to tokens groups (Seq[Token], Option[Int])
        map { case (_, gs) ⇒ gs.map { case (seq, _) ⇒ seq } }.
        map(seq ⇒ {
            val toks = seq.map { case (t, _) ⇒ t }
            val fuzzyOpt = seq.head._2

            // Fuzzy value has priority, otherwise the tokens may be a detected numeric.
            val numOpt: Option[Int] = fuzzyOpt.orElse(numsMap.get(toks).map(_.value.intValue()))

            Group(toks, numOpt, fuzzyOpt.nonEmpty)
        }).
        // Converts to sequence and sorts.
        toSeq.sortBy(_.index)

    // All continuous sub-sequences of groups, longest first.
    (for (n ← groups.length until 0 by -1) yield groups.sliding(n).map(GroupsHolder)).
        flatten.
        map(p ⇒ p.tokens → p).
        toMap
}
}