// blob: e331997d0c54f4110412d9e43561a25d4a416814 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.sort
import java.io.Serializable
import io.opencensus.trace.Span
import org.apache.nlpcraft.common.NCService
import org.apache.nlpcraft.common.makro.NCMacroParser
import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager
import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSentenceToken}
import org.apache.nlpcraft.probe.mgrs.NCProbeModel
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer
import scala.collection.{Map, Seq, mutable}
/**
* Sort enricher.
*/
object NCSortEnricher extends NCProbeEnricher {
// Identifier passed as the second argument of every `NCNlpSentenceNote` built by `enrich`
// (presumably the note type of the produced sort notes - confirm against `NCNlpSentenceNote`).
private final val TOK_ID = "nlpcraft:sort"
/**
  * Kinds of sort expressions recognized by the masks.
  */
object Type extends Enumeration {
    type Type = Value

    // Both the sorted subject and the `by` reference are present.
    val TYPE_SUBJ_BY: Value = Value
    // Only the sorted subject is present.
    val TYPE_SUBJ: Value = Value
    // Only the `by` reference is present.
    val TYPE_BY: Value = Value
}
import Type._
// Mask elements: SORT, BY, ORDER, x.
// Note that SORT, BY and ORDER are sets of words (not single words).
// x stands for one or more words. Each mask must contain x at least once and at most twice.
/**
  * Supported keyword masks mapped to the kind of sort expression each of them denotes.
  */
private final val MASKS: Map[String, Type] = {
    // Masks with both the subject and the `by` reference.
    val subjByMasks = Seq(
        "x SORT BY x",
        "x SORT BY x ORDER",
        "SORT x BY x",
        "SORT x BY x ORDER",
        "SORT x ORDER BY x",
        "x SORT ORDER BY x",
        "ORDER SORT x BY x"
    )

    // Masks with the subject only.
    val subjMasks = Seq(
        "SORT x ORDER",
        "SORT x BY ORDER",
        "ORDER SORT x",
        "SORT x",
        "x SORT"
    )

    // Masks with the `by` reference only.
    val byMasks = Seq(
        "SORT BY x ORDER",
        "SORT BY x",
        "ORDER SORT BY x"
    )

    (
        subjByMasks.map(_ → TYPE_SUBJ_BY) ++
        subjMasks.map(_ → TYPE_SUBJ) ++
        byMasks.map(_ → TYPE_BY)
    ).toMap
}
/**
  * Note type together with the token indexes it refers to.
  *
  * @param note Note type.
  * @param indexes Token indexes.
  */
case class NoteData(note: String, indexes: Seq[Int]) {
    // Kept for debugging purposes.
    override def toString: String = {
        val idxs = indexes.mkString(",")

        s"NoteData [note=$note, indexes=[$idxs]]"
    }
}
// Keyword dictionaries. All of them are assigned in `start()` and reset to `null` in `stop()`.
// Stems of the SORT keywords.
@volatile private var sort: Seq[String] = _
// Stems of the BY keywords.
@volatile private var by: Seq[String] = _
// Stemmed ORDER phrases, each paired with its `asc` flag (`true` for the ascending phrases).
@volatile private var order: Seq[(String, Boolean)] = _
// Stem of the word "and".
@volatile private var stemAnd: String = _
// Distinct single words of all SORT, BY and ORDER entries.
@volatile private var maskWords: Seq[String] = _
/**
  * Result of a successful mask match.
  *
  * @param asc Ascending flag of the matched ORDER phrase, if any.
  * @param main Tokens the sort note is created for (must be non-empty).
  * @param stop Keyword tokens which `enrich` marks with a stop reason.
  * @param subjSeq Alternative note chains of the sorted subject.
  * @param bySeq Alternative note chains of the `by` reference.
  */
private case class Match(
    asc: Option[Boolean],
    main: Seq[NCNlpSentenceToken],
    stop: Seq[NCNlpSentenceToken],
    subjSeq: Seq[Seq[NoteData]],
    bySeq: Seq[Seq[NoteData]]
) {
    require(main.nonEmpty)
    require(subjSeq.nonEmpty || bySeq.nonEmpty)

    // Kept for debugging purposes.
    override def toString: String = {
        def toksStr(toks: Seq[NCNlpSentenceToken]): String = {
            val texts = toks.map(_.origText).mkString(", ")

            s"[$texts]"
        }

        def chainStr(chain: Seq[NoteData]): String = {
            val items = chain.map(nd ⇒ s"${nd.note}: [${nd.indexes.mkString(", ")}]").mkString(", ")

            s"[$items]"
        }

        def chainsStr(chains: Seq[Seq[NoteData]]): String = {
            val items = chains.map(chainStr).mkString(", ")

            s"[$items]"
        }

        s"Match [main=${toksStr(main)}, stop=${toksStr(stop)}, subjSeq=${chainsStr(subjSeq)}, bySeq=${chainsStr(bySeq)}]"
    }
}
/**
  * Checks the consistency of the keyword dictionaries (`sort`, `by`, `order`) and of `MASKS`.
  * Expects the dictionaries to be initialized already. Any violated rule fails a `require`,
  * i.e. throws `IllegalArgumentException`.
  */
private def validate(): Unit = {
    val orderPhrases = order.map(_._1)

    // Not duplicated.
    require(sort.size + by.size + order.size == (sort ++ by ++ orderPhrases).distinct.size)

    // Single words.
    require(!sort.exists(_.contains(" ")))
    require(!by.exists(_.contains(" ")))

    // Different words.
    require(sort.intersect(by).isEmpty)
    require(sort.intersect(orderPhrases).isEmpty)
    require(by.intersect(orderPhrases).isEmpty)

    // `Sort by` as one element.
    require(MASKS.filter(_._2 == TYPE_BY).keys.forall(_.contains("SORT BY")))

    val ordersSeq: Seq[Seq[String]] = orderPhrases.map(_.split(" ").toSeq)

    // ORDER doesn't contain words from BY (it can contain words from SORT).
    // NOTE(review): `ordersSeq` holds word sequences while `by` holds single words, so this
    // compares a `String` with `Seq[String]` elements and can never fail. Left as is on purpose:
    // some ORDER phrases built in `start()` begin with "by" - confirm the intended rule before tightening.
    require(!by.exists(ordersSeq.contains))

    // Right order of keywords and references.
    MASKS.keys.map(_.split(" ")).foreach(seq ⇒ {
        require(seq.forall(p ⇒ p == "SORT" || p == "ORDER" || p == "BY" || p == "x"))

        // Each keyword may appear once per mask, `x` once or twice.
        seq.groupBy(p ⇒ p).foreach { case (key, group) ⇒
            val n = group.length

            key match {
                case "x" ⇒ require(n == 1 || n == 2)
                case _ ⇒ require(n == 1)
            }
        }
    })
}
/**
  * Collects the non-NLP notes of the given tokens which lie completely between the first
  * and the last token, sorted by their first index and without duplicates.
  *
  * @param toks Non-empty sequence of tokens.
  */
private def toNoteData(toks: Seq[NCNlpSentenceToken]): Seq[NoteData] = {
    require(toks.nonEmpty)

    val (lo, hi) = (toks.head.index, toks.last.index)

    val data =
        for (n ← toks.flatten if !n.isNlp && n.tokenIndexes.head >= lo && n.tokenIndexes.last <= hi)
            yield NoteData(n.noteType, n.tokenFrom to n.tokenTo)

    data.sortBy(_.indexes.head).distinct
}
/**
  * Builds all chains of notes which cover the given tokens from the first to the last
  * token holding a user (non-value) note. Neighbouring notes of a chain may be separated
  * only by stop words or by the word "and".
  *
  * [Token] → [NoteData]
  * [Token(A, B), Token(A), Token(C, D), Token(C, D, X), Token(Z)] ⇒
  * [ [A (0, 1), C (2, 3), Z (4)], [A (0, 1), D (2, 3), Z (4) ] ]
  *
  * @param toks Tokens.
  * @param toksNoteData Notes data of these tokens.
  * @param nullable If `false`, an empty result raises `AssertionError`.
  */
private def split(toks: Seq[NCNlpSentenceToken], toksNoteData: Seq[NoteData], nullable: Boolean): Seq[Seq[NoteData]] = {
    /**
      * Tells whether everything between two token indexes can be skipped
      * (there is nothing, or only stop words and "and").
      *
      * @param tok1Idx First token index.
      * @param tok2Idx Second token index.
      */
    def gapIgnorable(tok1Idx: Int, tok2Idx: Int): Boolean =
        toks.
            filter(t ⇒ t.index > tok1Idx && t.index < tok2Idx).
            forall(t ⇒ t.isStopWord || t.stem == stemAnd)

    val chains: Seq[Seq[NoteData]] =
        if (toksNoteData.isEmpty)
            Seq.empty
        else {
            val found = mutable.ArrayBuffer.empty[Seq[NoteData]]

            // Borders: first and last tokens with a user (non-value) note.
            val minIdx = toks.dropWhile(t ⇒ !isUserNotValue(t)).head.index
            val maxIdx = toks.reverse.dropWhile(t ⇒ !isUserNotValue(t)).head.index

            require(minIdx <= maxIdx)

            // Extends the chain with `nd`, tries every possible continuation on a copy of the chain
            // and finally registers the chain itself if it spans from `minIdx` to `maxIdx`.
            def fill(nd: NoteData, chain: mutable.ArrayBuffer[NoteData]): Unit = {
                chain += nd

                val lastIdx = nd.indexes.last

                for (next ← toksNoteData if lastIdx < next.indexes.head && gapIgnorable(lastIdx, next.indexes.head))
                    fill(next, chain.clone())

                if (chain.nonEmpty && chain.head.indexes.head == minIdx && chain.last.indexes.last == maxIdx)
                    found += chain
            }

            for (first ← toksNoteData if first.indexes.head == minIdx)
                fill(first, mutable.ArrayBuffer.empty[NoteData])

            found
        }

    if (chains.isEmpty && !nullable)
        throw new AssertionError(s"Invalid empty result " +
            s"[tokensTexts=[${toks.map(_.origText).mkString("|")}]" +
            s", notes=[${toks.flatten.map(n ⇒ s"${n.noteType}:[${n.tokenIndexes.mkString(",")}]").mkString("|")}]" +
            s", tokensIndexes=[${toks.map(_.index).mkString("|")}]" +
            s", allData=[${toksNoteData.mkString("|")}]" +
            s"]"
        )

    chains
}
/**
  * Tells whether the token has a user note and the user note returned by `find`
  * does not contain "value".
  *
  * @param t Token to check.
  * @return `false` if the token has no user note or that note contains "value".
  */
private def isUserNotValue(t: NCNlpSentenceToken): Boolean =
    t.find(_.isUser).exists(!_.contains("value"))
/**
  * Tells whether the note is a user note which does not contain "value".
  *
  * @param n Note to check.
  */
private def isUserNotValue(n: NCNlpSentenceNote): Boolean =
    if (n.isUser) !n.contains("value") else false
/**
  * Tries to interpret the given tokens as a sort expression described by one of `MASKS`.
  *
  * The SORT, BY and ORDER keywords are located first, the remaining tokens must be references
  * (tokens with user non-value notes) optionally mixed with stop words or "and". The sequence of
  * keyword kinds and references is then folded into a mask and looked up in `MASKS`.
  *
  * @param toks Non-empty sequence of tokens.
  * @return Match data or `None` if the tokens don't form a supported sort expression.
  */
private def tryToMatch(toks: Seq[NCNlpSentenceToken]): Option[Match] = {
    require(toks.nonEmpty)

    case class KeyWord(tokens: Seq[NCNlpSentenceToken], synonymIndex: Int) {
        require(tokens.nonEmpty)
    }

    // Looks for a run of not yet used tokens whose joined stems equal one of the given key stems.
    // Longer runs are tried first. `synonymIndex` is the position of the found stem in `keyStems`.
    def extract(keyStems: Seq[String], used: Seq[NCNlpSentenceToken]): Option[KeyWord] = {
        require(keyStems.nonEmpty)

        val maxWords = keyStems.map(_.count(_ == ' ')).max + 1

        (1 to maxWords).reverse.flatMap(i ⇒
            toks.sliding(i).filter(toks ⇒ used.intersect(toks).isEmpty).
                map(toks ⇒ toks.map(_.stem).mkString(" ") → toks).toMap.
                flatMap { case (stem, stemToks) ⇒
                    if (keyStems.contains(stem)) Some(KeyWord(stemToks, keyStems.indexOf(stem))) else None
                }.toStream.headOption
        ).toStream.headOption
    }

    var res: Option[Match] = None

    // Order is important.
    // SORT and ORDER don't have same words (validated)
    val orderOpt = extract(order.map(_._1), used = Seq.empty)
    val byOpt = extract(by, used = orderOpt.toSeq.flatMap(_.tokens))
    val sortOpt = extract(sort, used = orderOpt.toSeq.flatMap(_.tokens) ++ byOpt.toSeq.flatMap(_.tokens))

    if (sortOpt.nonEmpty || orderOpt.nonEmpty) {
        val sortToks = sortOpt.toSeq.flatMap(_.tokens)
        val byToks = byOpt.toSeq.flatMap(_.tokens)
        val orderToks = orderOpt.toSeq.flatMap(_.tokens)

        val all = sortToks ++ byToks ++ orderToks

        // Mask element for the token: keyword kind, `x` for a reference, `-` for anything else.
        def getKeyWordType(t: NCNlpSentenceToken): String =
            if (sortToks.contains(t))
                "SORT"
            else if (byToks.contains(t))
                "BY"
            else if (orderToks.contains(t))
                "ORDER"
            else if (isUserNotValue(t))
                "x"
            else
                "-"

        val others = toks.filter(t ⇒ !all.contains(t))

        if (others.nonEmpty) {
            val i1 = others.head.index
            val i2 = others.last.index

            val othersRefs = others.filter(
                t ⇒ t.exists(n ⇒ isUserNotValue(n) && n.tokenIndexes.head >= i1 && n.tokenIndexes.last <= i2)
            )

            if (
                othersRefs.nonEmpty &&
                others.filter(p ⇒ !othersRefs.contains(p)).
                    forall(p ⇒ (p.isStopWord || p.stem == stemAnd) && !maskWords.contains(p.stem))
            ) {
                // It removes duplicates (`SORT x x ORDER x x x` converts to `SORT x ORDER x`)
                val mask = toks.map(getKeyWordType).
                    foldLeft("")((x, y) ⇒ if (x.endsWith(y)) x else s"$x $y").trim

                MASKS.get(mask) match {
                    case Some(typ) ⇒
                        // Indexes of the keywords located between the non-keyword tokens.
                        val sepIdxs = all.
                            map(_.index).
                            filter(i ⇒ others.exists(_.index > i) && others.exists(_.index < i)).
                            sorted

                        // Divides separated by keywords.
                        val (part1, part2) =
                            if (sepIdxs.isEmpty)
                                (others, Seq.empty)
                            else
                                (others.filter(_.index < sepIdxs.head), others.filter(_.index > sepIdxs.last))

                        require(part1.nonEmpty)

                        val data1 = toNoteData(part1)
                        val data2 = if (part2.isEmpty) Seq.empty else toNoteData(part2)

                        if (data1.nonEmpty || data2.nonEmpty) {
                            val seq1 =
                                if (data1.nonEmpty)
                                    split(part1, data1, nullable = false)
                                else
                                    split(part2, data2, nullable = false)
                            val seq2 =
                                if (data1.nonEmpty && data2.nonEmpty)
                                    split(part2, data2, nullable = true)
                                else
                                    Seq.empty
                            val asc = orderOpt.map(o ⇒ order(o.synonymIndex)._2)

                            typ match {
                                case TYPE_SUBJ ⇒
                                    require(seq1.nonEmpty)
                                    require(seq2.isEmpty)
                                    require(sortToks.nonEmpty)

                                    // Ignores invalid cases.
                                    if (byToks.isEmpty)
                                        res =
                                            Some(
                                                Match(
                                                    asc = asc,
                                                    main = sortToks,
                                                    stop = orderToks,
                                                    subjSeq = seq1,
                                                    bySeq = Seq.empty
                                                )
                                            )

                                case TYPE_SUBJ_BY ⇒
                                    require(seq1.nonEmpty)
                                    require(sortToks.nonEmpty)
                                    require(byToks.nonEmpty)

                                    // `seq2` is either built with `nullable = true` or not built at all,
                                    // so it can be empty here. Such input is ignored instead of
                                    // failing a `require` on user text.
                                    if (seq2.nonEmpty)
                                        res = Some(
                                            Match(
                                                asc = asc,
                                                main = sortToks,
                                                stop = byToks ++ orderToks,
                                                subjSeq = seq1,
                                                bySeq = seq2
                                            )
                                        )

                                case TYPE_BY ⇒
                                    require(seq1.nonEmpty)
                                    require(seq2.isEmpty)
                                    require(sortToks.nonEmpty)
                                    require(byToks.nonEmpty)

                                    // `Sort by` as one element, see validation.
                                    res = Some(
                                        Match(
                                            asc = asc,
                                            main = sortToks ++ byToks,
                                            stop = orderToks,
                                            subjSeq = Seq.empty,
                                            bySeq = seq1
                                        )
                                    )

                                case _ ⇒ throw new AssertionError(s"Unexpected type: $typ")
                            }
                        }
                    case None ⇒ // No-op.
                }
            }
        }
    }

    res
}
/**
  * Checks that no important token was dropped as a stop word. A token is important
  * if it holds a user (non-value) note or its stem is one of the mask words.
  *
  * @param ns Sentence.
  * @param toks Tokens in which some stopwords can be deleted. Must be sorted by index.
  * @return `true` if nothing was dropped between the first and the last token, or if the
  *         dropped tokens contain no important ones.
  */
private def validImportant(ns: NCNlpSentence, toks: Seq[NCNlpSentenceToken]): Boolean = {
    def important(t: NCNlpSentenceToken): Boolean = isUserNotValue(t) || maskWords.contains(t.stem)

    val positions = toks.map(_.index)

    require(positions == positions.sorted)

    // Same range of the sentence including everything which was dropped.
    val original = ns.slice(positions.head, positions.last + 1)

    if (toks.length == original.length)
        true
    else
        toks.count(important) == original.count(important)
}
/**
  * Adds sort notes to the sentence. Every token combination returned by
  * `ns.tokenMixWithStopWords()` which passes `validImportant` is matched against the masks;
  * for each match one note is created per combination of subject and `by` chains.
  *
  * @param mdl Model, used here only for the span tags.
  * @param ns Sentence to enrich.
  * @param meta Not used by this enricher.
  * @param parent Parent span.
  */
override def enrich(mdl: NCProbeModel, ns: NCNlpSentence, meta: Map[String, Serializable], parent: Span): Unit =
    startScopedSpan("enrich", parent,
        "srvReqId" → ns.srvReqId,
        "mdlId" → mdl.model.getId,
        "txt" → ns.text) { _ ⇒
        // Notes created so far, used to skip equal or similar ones.
        val notes = mutable.HashSet.empty[NCNlpSentenceNote]

        for (toks ← ns.tokenMixWithStopWords() if validImportant(ns, toks)) {
            tryToMatch(toks) match {
                case Some(m) ⇒
                    // Appends the note types and the token indexes of the chain to the parameters.
                    def addNotes(
                        params: ArrayBuffer[(String, Any)],
                        seq: Seq[NoteData],
                        notesName: String,
                        idxsName: String
                    ): ArrayBuffer[(String, Any)] = {
                        params += notesName → seq.map(_.note).asJava
                        params += idxsName → seq.map(_.indexes.asJava).asJava

                        params
                    }

                    // Creates the note and registers it unless an equal or similar one already exists.
                    def mkNote(params: ArrayBuffer[(String, Any)]): Unit = {
                        val note = NCNlpSentenceNote(m.main.map(_.index), TOK_ID, params: _*)

                        if (!notes.exists(n ⇒ ns.notesEqualOrSimilar(n, note))) {
                            notes += note

                            m.main.foreach(_.add(note))
                            m.stop.foreach(_.addStopReason(note))
                        }
                    }

                    // Fresh parameters holding only the optional `asc` flag.
                    def mkParams(): mutable.ArrayBuffer[(String, Any)] = {
                        val params = mutable.ArrayBuffer.empty[(String, Any)]

                        m.asc.foreach(asc ⇒ params += "asc" → asc)

                        params
                    }

                    if (m.subjSeq.nonEmpty)
                        for (subj ← m.subjSeq) {
                            def addSubj(): ArrayBuffer[(String, Any)] =
                                addNotes(mkParams(), subj, "subjnotes", "subjindexes")

                            if (m.bySeq.nonEmpty)
                                for (by ← m.bySeq)
                                    mkNote(addNotes(addSubj(), by, "bynotes", "byindexes"))
                            else
                                mkNote(addSubj())
                        }
                    else {
                        require(m.bySeq.nonEmpty)

                        for (by ← m.bySeq)
                            mkNote(addNotes(mkParams(), by, "bynotes", "byindexes"))
                    }

                case None ⇒ // No-op.
            }
        }
    }
/**
  * Starts the enricher: builds the stemmed SORT, BY and ORDER dictionaries together with the
  * derived data, validates them and then calls the parent `start()`.
  *
  * @param parent Optional parent span.
  */
override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ ⇒
    // Single words.
    sort =
        Seq("sort", "rank", "classify", "order", "arrange", "organize", "segment", "shuffle").map(NCNlpCoreManager.stem)

    // Single words.
    // Cannot be same as in SORT.
    by = Seq("by", "on", "with").map(NCNlpCoreManager.stem)

    // Multiple words.
    // Cannot be same as in SORT and BY.
    // Some words from chunks can be the same as SORT but cannot be same as BY.
    order = {
        val p = NCMacroParser()

        // Phrase (macros are expanded below) → ascending flag.
        Seq(
            "top down" → false,
            "bottom up" → true,
            "ascending" → true,
            "asc" → true,
            "descending" → false,
            "desc" → false,
            "{in|by|from} {top down|descending} {order|way|fashion|*}" → false,
            "{in|by|from} {bottom up|ascending} {order|way|fashion|*}" → true
        ).flatMap { case (txt, asc) ⇒ p.expand(txt).map(s ⇒ NCNlpCoreManager.stem(s) → asc) }
    }

    stemAnd = NCNlpCoreManager.stem("and")

    // All distinct single words used by any keyword or phrase.
    maskWords =
        (sort ++ by ++ order.map(_._1)).flatMap(_.split(" ")).map(_.trim).filter(_.nonEmpty).distinct

    validate()

    super.start()
}
// Stops the enricher: calls the parent `stop()` within the "stop" span and then
// resets all the dictionaries assigned in `start()` to `null`.
override def stop(parent: Span = null): Unit = startScopedSpan("stop", parent) { _ ⇒
super.stop()
// Drop the data built by `start()`.
sort = null
by = null
order = null
stemAnd = null
maskWords = null
}
}