// blob: c3707386a7ff3e3963825ddab40da33719e83d45 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nlpcraft.probe.mgrs
import org.apache.nlpcraft.common.U
import org.apache.nlpcraft.common.nlp.{NCNlpSentenceToken, NCNlpSentenceTokenBuffer}
import org.apache.nlpcraft.model._
import org.apache.nlpcraft.model.intent.NCIdlContext
import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.NCIdlContent
import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind._
import scala.collection.mutable
/**
* Probe-side synonym represented as a mutable, ordered buffer of synonym chunks.
* Instances are comparable (see `compare`) so that synonyms can be ranked by significance.
*
* @param isElementId Is this an implicit element ID synonym?
* In this case chunks contain the element ID.
* @param isValueName Is this an implicit value name synonym?
* In this case chunks contain value name.
* @param isDirect Direct or permuted synonym flag.
* @param value Optional value name if this is a value synonym (`null` otherwise).
* @param sparse Sparse flag. Non-IDL sparse synonyms must be matched with `sparseMatch`,
* non-sparse ones with `isMatch`.
* @param permute Permutation flag. When `false`, sparse matching requires matched elements
* to appear in the same order as the chunks.
*/
class NCProbeSynonym(
val isElementId: Boolean,
val isValueName: Boolean,
val isDirect: Boolean,
val value: String = null,
val sparse: Boolean,
val permute: Boolean
) extends mutable.ArrayBuffer[NCProbeSynonymChunk] with Ordered[NCProbeSynonym] {
// An element ID synonym can neither be a value name synonym nor carry a value.
require((isElementId && !isValueName && value == null) || !isElementId)
// A value name synonym must carry a value.
require((isValueName && value != null) || !isValueName)
// NOTE: all properties below are lazy, i.e. computed once on first access. Chunks added
// to this buffer after that first access are not reflected in the cached results.
// `true` if every chunk is of TEXT kind.
lazy val isTextOnly: Boolean = forall(_.kind == TEXT)
// Number of REGEX chunks.
lazy val regexChunks: Int = count(_.kind == REGEX)
// Number of IDL chunks.
lazy val idlChunks: Int = count(_.kind == IDL)
// `true` if there is at least one IDL chunk.
lazy val hasIdl: Boolean = idlChunks != 0
// `true` if this synonym carries a value name.
lazy val isValueSynonym: Boolean = value != null
// Word stems of all chunks joined with a single space.
lazy val stems: String = map(_.wordStem).mkString(" ")
// Hash of `stems`, used as a cheap pre-check before comparing the stems strings.
lazy val stemsHash: Int = stems.hashCode
/**
 * Gets the evaluation priority of the given chunk kind. Token/chunk pairs are sorted by
 * this value before matching, so kinds with a lower number are checked first.
 *
 * @param kind Synonym chunk kind.
 * @return `0` for TEXT, `1` for IDL and `2` for REGEX.
 * @throws AssertionError If the kind is none of the above.
 */
private def getSort(kind: NCSynonymChunkKind): Int =
    if (kind == TEXT)
        0
    else if (kind == IDL)
        1
    else if (kind == REGEX)
        2
    else
        throw new AssertionError(s"Unexpected kind: $kind")
/**
 * Checks whether a single NLP token matches a single non-IDL synonym chunk.
 *
 * @param tok NLP sentence token to check.
 * @param chunk Synonym chunk of TEXT or REGEX kind.
 * @return `true` if the token stem equals the chunk stem (TEXT) or if either the original
 *      or the normalized token text fully matches the chunk regex (REGEX).
 * @throws AssertionError If the chunk is of IDL or any other unsupported kind.
 */
private def isMatch(tok: NCNlpSentenceToken, chunk: NCProbeSynonymChunk): Boolean =
    chunk.kind match {
        case TEXT =>
            chunk.wordStem == tok.stem

        case REGEX =>
            val pattern = chunk.regex

            def fullyMatches(text: String): Boolean = pattern.matcher(text).matches()

            // Normalized text is only consulted when the original text does not match.
            fullyMatches(tok.origText) || fullyMatches(tok.normText)

        // IDL chunks cannot be evaluated against plain NLP tokens.
        case _ =>
            throw new AssertionError()
    }
/**
 * Generic sparse matching routine shared by the token-based and IDL-based `sparseMatch` methods.
 *
 * Chunks are processed in their order. For every chunk all not-yet-selected elements matching it
 * are collected and the first of them is selected. The match fails as soon as:
 *  - no element matches the current chunk;
 *  - permutation is disabled and the selected element does not come after the previously
 *    selected one (by `getIndex`);
 *  - the overall number of distinct candidate elements exceeds the number of chunks.
 *
 * After all chunks are processed the match is accepted only if every candidate was selected
 * (i.e. there was no ambiguity) and, when `shouldBeNeighbors` is set, the sorted indexes of the
 * selected elements pass the `U.isIncreased` check.
 *
 * @param toks Elements to match against.
 * @param isMatch Predicate checking a single element against a single chunk.
 * @param getIndex Function providing the position of an element.
 * @param shouldBeNeighbors Whether the additional `U.isIncreased` check must be applied.
 * @tparam T Type of the matched elements.
 * @return Selected elements, in chunk order, or `None` if there is no match.
 */
private def sparseMatch0[T](
    toks: Seq[T],
    isMatch: (T, NCProbeSynonymChunk) => Boolean,
    getIndex: T => Int,
    shouldBeNeighbors: Boolean
): Option[Seq[T]] =
    if (toks.size < this.size)
        None
    else {
        // Elements selected so far, one per processed chunk.
        val picked = mutable.ArrayBuffer.empty[T]
        // All distinct elements that matched any processed chunk.
        val candidates = mutable.HashSet.empty[T]
        val chunks = this.iterator
        var failed = false

        while (!failed && chunks.hasNext) {
            val chunk = chunks.next()
            val matched = toks.filter(t => !picked.contains(t) && isMatch(t, chunk))

            matched.headOption match {
                case None =>
                    failed = true

                case Some(first) =>
                    // Without permutation the elements must follow the chunks order.
                    if (!permute && picked.nonEmpty && getIndex(first) <= getIndex(picked.last))
                        failed = true
                    else {
                        candidates ++= matched

                        if (candidates.size > this.size)
                            failed = true
                        else
                            picked += first
                    }
            }
        }

        def neighborsOk: Boolean =
            !shouldBeNeighbors || U.isIncreased(picked.map(getIndex).toSeq.sorted)

        if (!failed && candidates.size == picked.size && neighborsOk)
            Some(picked.toSeq)
        else
            None
    }
/**
 * Checks whether a single IDL content element (either a model token or a plain NLP token)
 * matches a single synonym chunk.
 *
 * @param tow Model token (left) or NLP sentence token (right).
 * @param chunk Synonym chunk of any supported kind.
 * @param req Request passed to the IDL predicate via its context.
 * @return `true` if the element matches the chunk. An IDL chunk never matches a plain NLP token.
 * @throws AssertionError If the chunk kind is not supported.
 */
private def isMatch(tow: NCIdlContent, chunk: NCProbeSynonymChunk, req: NCRequest): Boolean = {
    // Applies the extractor that corresponds to the populated side of the element.
    def pick[T](fromToken: NCToken => T, fromWord: NCNlpSentenceToken => T): T =
        tow.fold(fromToken, fromWord)

    chunk.kind match {
        case TEXT =>
            chunk.wordStem == pick(_.stem, _.stem)

        case REGEX =>
            val pattern = chunk.regex

            // Normalized text is only consulted when the original text does not match.
            pattern.matcher(pick(_.origText, _.origText)).matches() ||
                pattern.matcher(pick(_.normText, _.normText)).matches()

        case IDL =>
            pick(
                tok => chunk.idlPred.apply(tok, NCIdlContext(req = req)).value.asInstanceOf[Boolean],
                _ => false
            )

        case _ =>
            throw new AssertionError()
    }
}
/**
 * Checks whether the given tokens match this synonym chunk-by-chunk, position-by-position.
 * Applicable only to non-sparse synonyms without IDL chunks.
 *
 * @param toks NLP tokens to check. Must not be `null`.
 * @return `true` if the number of tokens equals the number of chunks and every token matches
 *      the chunk at the same position.
 */
def isMatch(toks: NCNlpSentenceTokenBuffer): Boolean = {
    require(toks != null)
    require(!sparse && !hasIdl)

    if (toks.length != length)
        false
    else if (isTextOnly)
        // Hash comparison is a cheap pre-check before comparing the actual stems.
        toks.stemsHash == stemsHash && toks.stems == stems
    else {
        // Pairs are ordered by chunk kind priority before being checked.
        val ordered = toks.zip(this).sortBy { case (_, chunk) => getSort(chunk.kind) }

        ordered.forall(pair => isMatch(pair._1, pair._2))
    }
}
/**
 * Checks whether the given IDL content elements match this synonym chunk-by-chunk,
 * position-by-position.
 *
 * @param tows Model tokens (left) and/or NLP tokens (right) to check. Must not be `null`.
 * @param req Request passed down to IDL predicates.
 * @return `true` if the number of elements equals the number of chunks, there are at least as
 *      many model tokens as IDL chunks and every element matches the chunk at the same position.
 */
def isMatch(tows: Seq[NCIdlContent], req: NCRequest): Boolean = {
    require(tows != null)

    val sizesFit = tows.length == length && tows.count(_.isLeft) >= idlChunks

    // Pairs are ordered by chunk kind priority before being checked.
    sizesFit &&
        tows.zip(this).
            sortBy { case (_, chunk) => getSort(chunk.kind) }.
            forall(pair => isMatch(pair._1, pair._2, req))
}
/**
 * Sparse matching of plain NLP tokens. Applicable only to sparse synonyms without IDL chunks.
 * Tokens are positioned by their start character index.
 *
 * @param toks NLP tokens to match against. Must not be `null`.
 * @return Matched tokens or `None` if there is no match.
 */
def sparseMatch(toks: NCNlpSentenceTokenBuffer): Option[Seq[NCNlpSentenceToken]] = {
    require(toks != null)
    require(sparse && !hasIdl)

    sparseMatch0[NCNlpSentenceToken](
        toks.toSeq,
        (tok, chunk) => isMatch(tok, chunk),
        tok => tok.startCharIndex,
        shouldBeNeighbors = false
    )
}
/**
 * Sparse matching of IDL content elements. Applicable only to synonyms with IDL chunks.
 * Elements are positioned by their start character index. For non-sparse synonyms the
 * additional neighborhood check is requested from the underlying routine.
 *
 * @param tows Model tokens (left) and/or NLP tokens (right) to match against. Must not be `null`.
 * @param req Request passed down to IDL predicates. Must not be `null`.
 * @return Matched elements or `None` if there is no match.
 */
def sparseMatch(tows: Seq[NCIdlContent], req: NCRequest): Option[Seq[NCIdlContent]] = {
    require(tows != null)
    require(req != null)
    require(hasIdl)

    val matches = (tow: NCIdlContent, chunk: NCProbeSynonymChunk) => isMatch(tow, chunk, req)
    val startIndex = (tow: NCIdlContent) => tow.fold[Int](_.getStartCharIndex, _.startCharIndex)

    sparseMatch0[NCIdlContent](tows, matches, startIndex, shouldBeNeighbors = !sparse)
}
/**
 * Gets string representation of this synonym: its chunks separated by a single space.
 */
override def toString(): String = {
    val separator = " "

    mkString(separator)
}
/**
 * Orders synonyms from least to most significant.
 *
 * Any synonym is greater than `null`. Otherwise the following criteria are applied in order,
 * the first one that differs decides:
 *  1. element ID synonyms outrank all others (two element ID synonyms are equal);
 *  1. more chunks outrank fewer chunks;
 *  1. direct synonyms outrank non-direct ones;
 *  1. non-permuted synonyms outrank permuted ones;
 *  1. text-only synonyms outrank synonyms with regex/IDL chunks;
 *  1. among text-only synonyms, value synonyms outrank non-value ones;
 *  1. among the rest, fewer regex/IDL chunks (less uncertainty) outrank more.
 *
 * @param that Synonym to compare with, may be `null`.
 * @return Positive value, zero or negative value if this synonym is more, equally or less
 *      significant than the given one.
 * @throws IllegalArgumentException If this synonym has no IDL chunks and the `sparse` flags
 *      of the two synonyms differ.
 */
override def compare(that: NCProbeSynonym): Int =
    // The null guard must precede any dereferencing of `that`, including the sanity check below.
    if (that == null)
        1
    else {
        require(hasIdl || sparse == that.sparse, s"Invalid comparing [this=$this, that=$that]")

        // 1 if only `mine` is set, -1 if only `theirs` is set, 0 if both flags agree.
        def prefer(mine: Boolean, theirs: Boolean): Int =
            if (mine == theirs) 0 else if (mine) 1 else -1

        if (isElementId || that.isElementId)
            prefer(isElementId, that.isElementId)
        else if (length != that.length)
            if (length > that.length) 1 else -1
        else if (isDirect != that.isDirect)
            prefer(isDirect, that.isDirect)
        else if (permute != that.permute)
            // Arguments are swapped on purpose: the non-permuted synonym is the more significant one.
            prefer(that.permute, permute)
        else if (isTextOnly != that.isTextOnly)
            prefer(isTextOnly, that.isTextOnly)
        else if (isTextOnly)
            prefer(isValueSynonym, that.isValueSynonym)
        else {
            val thisDynCnt = regexChunks + idlChunks
            val thatDynCnt = that.regexChunks + that.idlChunks

            // Less PoS/regex/IDL chunks means less uncertainty, i.e. larger weight.
            if (thisDynCnt < thatDynCnt)
                1
            else if (thisDynCnt > thatDynCnt)
                -1
            else
                0
        }
    }
/**
 * Tells whether the given object can be compared for equality with a synonym.
 *
 * @param other Object to check.
 * @return `true` only if the object is a [[NCProbeSynonym]].
 */
override def canEqual(other: Any): Boolean =
    other match {
        case _: NCProbeSynonym => true
        case _ => false
    }
/**
 * Two synonyms are equal if they hold equal chunks and agree on the text-only flag, regex and
 * IDL chunk counts, value synonym flag, element ID flag, value name flag and value.
 *
 * @param other Object to compare with.
 */
override def equals(other: Any): Boolean =
    other match {
        case that: NCProbeSynonym =>
            val sameChunks = super.equals(that) && that.canEqual(this)

            sameChunks && {
                val sameShape =
                    isTextOnly == that.isTextOnly &&
                    regexChunks == that.regexChunks &&
                    idlChunks == that.idlChunks

                sameShape &&
                    isValueSynonym == that.isValueSynonym &&
                    isElementId == that.isElementId &&
                    isValueName == that.isValueName &&
                    value == that.value
            }

        case _ =>
            false
    }
/**
 * Hash code built from the same properties that take part in `equals`, combined with the
 * conventional 31-multiplier scheme. A `null` value contributes `0`.
 */
override def hashCode(): Int = {
    val fields = Seq[Any](
        super.hashCode(),
        isTextOnly,
        regexChunks,
        idlChunks,
        isValueSynonym,
        isElementId,
        isValueName,
        value
    )

    fields.foldLeft(0) { (acc, field) =>
        val fieldHash = if (field == null) 0 else field.hashCode()

        31 * acc + fieldHash
    }
}
}
object NCProbeSynonym {
    /** Element to match against: either a model token (left) or a plain NLP sentence token (right). */
    type NCIdlContent = Either[NCToken, NCNlpSentenceToken]

    /**
     * Creates a new synonym and fills it with the given chunks.
     *
     * @param isElementId Is this an implicit element ID synonym?
     * @param isValueName Is this an implicit value name synonym?
     * @param isDirect Direct or permuted synonym flag.
     * @param value Value name if this is a value synonym, `null` otherwise.
     * @param chunks Synonym chunks, in order.
     * @param sparse Sparse flag.
     * @param permute Permutation flag.
     * @return Newly created synonym holding the given chunks.
     */
    def apply(
        isElementId: Boolean,
        isValueName: Boolean,
        isDirect: Boolean,
        value: String,
        chunks: Seq[NCProbeSynonymChunk],
        sparse: Boolean,
        permute: Boolean
    ): NCProbeSynonym = {
        val synonym = new NCProbeSynonym(
            isElementId = isElementId,
            isValueName = isValueName,
            isDirect = isDirect,
            value = value,
            sparse = sparse,
            permute = permute
        )

        synonym ++= chunks
        synonym
    }
}