blob: 6f379b613531907d6510023d52ba5467f17d00f6 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.relation
import java.io.Serializable
import io.opencensus.trace.Span
import org.apache.nlpcraft.common.makro.NCMacroParser
import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager
import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSentenceToken}
import org.apache.nlpcraft.common.{NCE, NCService}
import org.apache.nlpcraft.probe.mgrs.NCProbeModel
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
import scala.collection.JavaConverters._
import scala.collection.{Map, Seq, mutable}
/**
* Relation enricher.
*/
object NCRelationEnricher extends NCProbeEnricher {
    /**
     * Relation function holder.
     *
     * @param funcType Relation function type (`compare` or `correlate`).
     * @param headStem Stem of the mandatory "head" word for this function.
     * @param allStems All stems recognized for this function (head stem plus connecting-word stems).
     */
    case class Holder(funcType: String, headStem: String, allStems: Set[String])

    /**
     * Successful match of a relation function against a token span.
     *
     * @param funcType Matched relation function type.
     * @param matched All tokens that matched the function words.
     * @param matchedHead Token matching the function's head word.
     * @param refNotes Note types of the referenced tokens.
     * @param refIndexes Indexes of the referenced tokens (Java list, stored directly in the note).
     */
    case class Match(
        funcType: String,
        matched: Seq[NCNlpSentenceToken],
        matchedHead: NCNlpSentenceToken,
        refNotes: Set[String],
        refIndexes: java.util.List[Int]
    )

    /**
     * Group of tokens referenced by a relation function, together with their note types.
     * Both collections are required to be non-empty by construction.
     *
     * @param tokens Referenced tokens.
     * @param types Note types found on the referenced tokens.
     */
    case class Reference(tokens: Seq[NCNlpSentenceToken], types: Set[String]) {
        require(tokens.nonEmpty)
        require(types.nonEmpty)
    }

    // Note type ID produced by this enricher.
    private final val TOK_ID = "nlpcraft:relation"

    // Built-in note types that can participate in a relation (besides user-defined notes).
    private final val REL_TYPES = Seq(
        "nlpcraft:continent",
        "nlpcraft:subcontinent",
        "nlpcraft:country",
        "nlpcraft:metro",
        "nlpcraft:region",
        "nlpcraft:city",
        "nlpcraft:date"
    )

    // Initialized in start(); sorted by descending stem-set size so the most specific holder wins.
    private var FUNCS: Seq[Holder] = _
    // Union of all function stems, used for the fast "important token" check.
    private var ALL_FUNC_STEMS: Set[String] = _

    /**
     * Starts this component. Builds the relation function holders ("compare" and
     * "correlate" synonyms expanded via macros) and the aggregate stem set.
     *
     * @param parent Optional parent tracing span.
     */
    override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ ⇒
        val macros = NCMacroParser()

        FUNCS = {
            val seq = mutable.ArrayBuffer.empty[Holder]

            /**
             * Adds holders for one relation function.
             *
             * @param f Function ID.
             * @param head Mandatory "start" word(s), as a macro expression; each expansion must be a single word.
             * @param tails All possible "connecting" words (zero or more can be in the sentence).
             */
            def add(f: String, head: String, tails: Seq[String]): Unit = {
                require(!head.contains(" "))

                val tailsSeq = tails.toSet.map(NCNlpCoreManager.stem)

                macros.expand(head).map(NCNlpCoreManager.stem).foreach(s ⇒ seq += Holder(f, s, tailsSeq + s))
            }

            val tails = Seq(
                "to",
                "and",
                "for",
                "with",
                "in",
                "of",
                "between",
                "statistic",
                "stats",
                "info",
                "metrics",
                "information",
                "data",
                "vs",
                "versus"
            )

            add("compare", "{compare|comparing|contrast|difference|match|vs|versus}", tails)
            add("correlate", "{correlate|correlation|relation|relationship}", tails)

            // Most specific (largest stem set) first.
            seq.sortBy(-_.allStems.size)
        }

        ALL_FUNC_STEMS = FUNCS.flatMap(_.allStems).toSet

        super.start()
    }

    /**
     * Stops this component.
     *
     * @param parent Optional parent tracing span.
     */
    override def stop(parent: Span = null): Unit = startScopedSpan("stop", parent) { _ ⇒
        super.stop()
    }

    /**
     * Checks whether important tokens were deleted as stopwords or not.
     *
     * @param ns Sentence.
     * @param toks Tokens in which some stopwords can be deleted.
     * @return `true` if no important token of the covered sentence span is missing from `toks`.
     */
    private def validImportant(ns: NCNlpSentence, toks: Seq[NCNlpSentenceToken]): Boolean = {
        // A token matters if it carries a user/relation note or is one of the function words.
        def isImportant(t: NCNlpSentenceToken): Boolean =
            t.exists(n ⇒ n.isUser || REL_TYPES.contains(n.noteType)) || ALL_FUNC_STEMS.contains(t.stem)

        val idxs = toks.map(_.index)

        require(idxs == idxs.sorted)

        // Full sentence slice covering the same index range, including dropped stopwords.
        val toks2 = ns.slice(idxs.head, idxs.last + 1)

        toks.length == toks2.length || toks.count(isImportant) == toks2.count(isImportant)
    }

    /**
     * Enriches the sentence with `nlpcraft:relation` notes.
     *
     * @param mdl Probe model.
     * @param ns Sentence to enrich.
     * @param senMeta Sentence metadata.
     * @param parent Optional parent tracing span.
     */
    @throws[NCE]
    override def enrich(mdl: NCProbeModel, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span = null): Unit =
        startScopedSpan("enrich", parent,
            "srvReqId" → ns.srvReqId,
            "mdlId" → mdl.model.getId,
            "txt" → ns.text) { _ ⇒
            // Tries to grab tokens direct way.
            // Example: A, B, C ⇒ ABC, AB, BC .. (AB will be processed first)
            val notes = mutable.HashSet.empty[NCNlpSentenceNote]

            for (toks ← ns.tokenMixWithStopWords() if validImportant(ns, toks))
                tryToMatch(toks) match {
                    case Some(m) ⇒
                        for (refNote ← m.refNotes) {
                            val note = NCNlpSentenceNote(
                                Seq(m.matchedHead.index),
                                TOK_ID,
                                "type" → m.funcType,
                                "indexes" → m.refIndexes,
                                "note" → refNote
                            )

                            // Skip duplicates of already-added (or similar) notes.
                            if (!notes.exists(n ⇒ ns.notesEqualOrSimilar(n, note))) {
                                notes += note

                                // Non-head matched tokens become stopwords attributed to this note.
                                m.matched.filter(_ != m.matchedHead).foreach(_.addStopReason(note))

                                m.matchedHead.add(note)
                            }
                        }
                    case None ⇒ // No-op.
                }
        }

    /**
     * Finds a reference (tokens with user or relation notes) fully contained in the given tokens.
     * A note qualifies only when at least one other qualified note does not overlap it, and only
     * note types that occur more than once are kept.
     *
     * @param toks Tokens to scan.
     * @return Reference, or `None` if no suitable notes found.
     */
    private def getReference(toks: Seq[NCNlpSentenceToken]): Option[Reference] =
        if (toks.isEmpty)
            None
        else {
            val sortedToks = toks.sortBy(_.index)

            val i1 = sortedToks.head.index
            val i2 = sortedToks.last.index

            val notes =
                sortedToks.flatMap(_.filter(n ⇒ n.isUser || REL_TYPES.contains(n.noteType))).
                    // Finds notes for tokens related to only given tokens.
                    filter(n ⇒ n.tokenFrom >= i1 && n.tokenTo <= i2)

            val suitNotes =
                notes.
                    // Keep only notes that have at least one non-overlapping companion.
                    filter(n1 ⇒ notes.exists(
                        n2 ⇒ n2.tokenFrom > n1.tokenTo || n2.tokenTo < n1.tokenFrom)
                    ).
                    groupBy(_.noteType).
                    // A relation needs at least two notes of the same type.
                    flatMap { case (_, ns) ⇒ if (ns.size > 1) ns else Seq.empty }

            if (suitNotes.nonEmpty)
                Some(
                    Reference(
                        toks.filter(t ⇒ suitNotes.exists(t.contains)),
                        suitNotes.map(_.noteType).toSet
                    )
                )
            else
                None
        }

    /**
     * Tries to match a relation function against the given tokens: splits them into reference
     * candidates (tokens with user/relation notes) and function-word candidates, then matches
     * the latter against the configured holders — first with all tokens, then ignoring stopwords.
     *
     * @param toks Tokens to match.
     * @return Match, or `None` if the tokens do not form a relation.
     */
    private def tryToMatch(toks: Seq[NCNlpSentenceToken]): Option[Match] = {
        val i1 = toks.head.index
        val i2 = toks.last.index

        // Tokens whose user/relation notes are fully inside the given span.
        var refOpts = toks.
            filter(t ⇒
                t.exists(n ⇒ (
                    n.isUser || REL_TYPES.contains(n.noteType)) &&
                    n.tokenIndexes.head >= i1 &&
                    n.tokenIndexes.last <= i2
                )
            )

        // Remaining tokens are candidates for the function words.
        val matchOpts = toks.diff(refOpts)

        if (refOpts.nonEmpty && matchOpts.nonEmpty)
            getReference(refOpts) match {
                case Some(r) ⇒
                    refOpts = r.tokens

                    // Matches given stems against holders: all stems must belong to the holder
                    // and the head stem must be present. FUNCS is pre-sorted, most specific first.
                    def try0(stems: Set[String]): Option[Match] =
                        FUNCS.
                            flatMap(h ⇒
                                if (stems.subsetOf(h.allStems) && stems.contains(h.headStem)) Some(h) else None
                            ).
                            headOption match {
                                case Some(h) ⇒
                                    Some(
                                        Match(
                                            h.funcType,
                                            matchOpts,
                                            matchOpts.
                                                find(_.stem == h.headStem).
                                                // Head stem was just verified present — absence is a bug.
                                                getOrElse(throw new AssertionError("Missed token.")),
                                            r.types,
                                            refOpts.map(_.index).asJava
                                        )
                                    )
                                case None ⇒ None
                            }

                    // First try with all candidate tokens, then retry ignoring stopwords.
                    try0(matchOpts.map(_.stem).toSet) match {
                        case Some(m) ⇒ Some(m)
                        case None ⇒ try0(matchOpts.filter(!_.isStopWord).map(_.stem).toSet)
                    }
                case None ⇒ None
            }
        else
            None
    }
}