| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * https://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.nlpcraft.common.nlp |
| |
| import org.apache.nlpcraft.common._ |
| |
| import java.io.{Serializable => JSerializable} |
| import java.util.{Collections, List => JList} |
| import scala.collection.mutable |
| import scala.jdk.CollectionConverters.CollectionHasAsScala |
| import scala.language.implicitConversions |
| |
/**
  * Companion holding the small key/link value types used by [[NCNlpSentence]].
  */
object NCNlpSentence {
    /**
      * Key made of a start and an end position.
      *
      * @param start Start position.
      * @param end End position.
      */
    case class NoteKey(
        start: Int,
        end: Int
    )

    /**
      * Key made of a string ID plus a start and an end position.
      *
      * @param id String ID.
      * @param start Start position.
      * @param end End position.
      */
    case class TokenKey(
        id: String,
        start: Int,
        end: Int
    )

    /**
      * Pairs a note name with a sequence of indexes.
      *
      * @param note Note name.
      * @param indexes Indexes.
      */
    case class NoteLink(
        note: String,
        indexes: Seq[Int]
    )
}
| |
| import org.apache.nlpcraft.common.nlp.NCNlpSentence._ |
| |
/**
  * Parsed NLP sentence is a collection of tokens. Each token is a collection of notes and
  * each note is a collection of KV pairs.
  *
  * @param srvReqId Server request ID.
  * @param text Normalized text.
  * @param enabledBuiltInToks Enabled built-in tokens.
  * @param tokens Initial buffer.
  * @param firstProbePhase Processing phase flag.
  * @param deletedNotes Deleted overridden notes with their tokens.
  * @param initNlpNotes Initial NLP notes. `null` until [[saveNlpNotes]] is called (or a non-null
  *     map is passed in explicitly).
  * @param nlpTokens NLP tokens.
  */
class NCNlpSentence(
    val srvReqId: String,
    val text: String,
    val enabledBuiltInToks: Set[String],
    override val tokens: mutable.ArrayBuffer[NCNlpSentenceToken] = new mutable.ArrayBuffer[NCNlpSentenceToken](32),
    var firstProbePhase: Boolean = true,
    private val deletedNotes: mutable.HashMap[NCNlpSentenceNote, Seq[NCNlpSentenceToken]] = mutable.HashMap.empty,
    private var initNlpNotes: Map[NoteKey, NCNlpSentenceNote] = null,
    private val nlpTokens: mutable.HashMap[TokenKey, NCNlpSentenceToken] = mutable.HashMap.empty
) extends NCNlpSentenceTokenBuffer(tokens) with JSerializable {
    // Lazily computed, cached hash code. `null` means "not computed yet / invalidated".
    // Transient, so it is not part of the serialized form and gets recomputed on demand.
    @transient
    private var hash: java.lang.Integer = _

    // Hash is built from the same fields that `equals` compares.
    private def calcHash(): Int = U.mkJavaHash(srvReqId, text, enabledBuiltInToks, tokens)

    /**
      * Copies this sentence. Tokens and deleted notes (with their tokens) are cloned element by element.
      * Note that `initNlpNotes` and `nlpTokens` are passed to the copy by reference, i.e. the `nlpTokens`
      * buffer is shared between this sentence and the returned copy.
      */
    override def clone(): NCNlpSentence =
        new NCNlpSentence(
            srvReqId = srvReqId,
            text = text,
            enabledBuiltInToks = enabledBuiltInToks,
            tokens = tokens.map(_.clone()),
            deletedNotes = deletedNotes.map(p => p._1.clone() -> p._2.map(_.clone())),
            initNlpNotes = initNlpNotes,
            nlpTokens = nlpTokens,
            firstProbePhase = firstProbePhase
        )

    /**
      * Utility method that gets set of notes for given note type collected from
      * tokens in this sentence. Notes are sorted in the same order they appear
      * in this sentence.
      *
      * @param noteType Note type.
      */
    def getNotes(noteType: String): Seq[NCNlpSentenceNote] = this.flatMap(_.getNotes(noteType)).toSeq.distinct

    /**
      * Utility method that removes given note from all tokens in this sentence.
      * No-op if such note wasn't found.
      *
      * @param note Note.
      */
    def removeNote(note: NCNlpSentenceNote): Unit = {
        this.foreach(_.remove(note))

        // Tokens were mutated and the cached hash is derived from `tokens` - invalidate it
        // (same as `fixNote` does) so that `hashCode` is recomputed on next use.
        hash = null
    }

    //noinspection HashCodeUsesVar
    override def hashCode(): Int = {
        if (hash == null)
            hash = calcHash()

        hash
    }

    override def equals(obj: Any): Boolean = obj match {
        case x: NCNlpSentence =>
            tokens == x.tokens &&
            srvReqId == x.srvReqId &&
            text == x.text &&
            enabledBuiltInToks == x.enabledBuiltInToks

        case _ => false
    }

    /**
      * Replaces given note with its clone created via `note.clone(kvs: _*)`. The replacement is done in
      * every token whose index lies between the first and the last token index of the cloned note
      * (inclusive). Cached hash code is invalidated.
      *
      * @param note Note to replace.
      * @param kvs Key-value pairs passed to the note's `clone`.
      */
    def fixNote(note: NCNlpSentenceNote, kvs: (String, JSerializable)*): Unit = {
        val fixed = note.clone(kvs: _*)

        this.filter(t => t.index >= fixed.tokenIndexes.head && t.index <= fixed.tokenIndexes.last).foreach(t => {
            t.remove(note)
            t.add(fixed)
        })

        hash = null
    }

    /**
      * Returns flag indicating whether two notes are equal (or similar) or not. The only difference
      * that is ignored is the one caused by stopword tokens.
      *
      * @param n1 First note.
      * @param n2 Second note.
      */
    def notesEqualOrSimilar(n1: NCNlpSentenceNote, n2: NCNlpSentenceNote): Boolean =
        if (n1.noteType != n2.noteType)
            false
        else {
            val stopIdxs = this.filter(_.isStopWord).map(_.index)

            // One possible difference - stopwords indexes.
            def wordsEqualOrSimilar0(n1: NCNlpSentenceNote, n2: NCNlpSentenceNote): Boolean = {
                val set1 = n1.wordIndexes.toSet
                val set2 = n2.wordIndexes.toSet

                set1 == set2 || (set1.subsetOf(set2) && set2.diff(set1).forall(stopIdxs.contains))
            }

            // Symmetric version: either note may be the one carrying extra stopword indexes.
            def wordsEqualOrSimilar(n1: NCNlpSentenceNote, n2: NCNlpSentenceNote): Boolean =
                wordsEqualOrSimilar0(n1, n2) || wordsEqualOrSimilar0(n2, n1)

            // Same idea as above but on token sets: extra tokens are tolerated only if they are stopwords.
            def tokensEqualOrSimilar0(set1: Set[NCNlpSentenceToken], set2: Set[NCNlpSentenceToken]): Boolean =
                set1 == set2 || (set1.subsetOf(set2) && set2.diff(set1).forall(_.isStopWord))

            def tokensEqualOrSimilar(set1: Set[NCNlpSentenceToken], set2: Set[NCNlpSentenceToken]): Boolean =
                tokensEqualOrSimilar0(set1, set2) || tokensEqualOrSimilar0(set2, set1)

            // Resolves a flat list of token indexes stored in the note under given name into sentence tokens.
            def getList(n: NCNlpSentenceNote, refIdxName: String): Set[NCNlpSentenceToken] =
                n.getOrElse(refIdxName, Collections.emptyList).asInstanceOf[JList[Int]].asScala.
                    map(this(_)).toSet

            // Same as `getList` but for a list of lists of token indexes.
            def getListList(n: NCNlpSentenceNote, refIdxName: String): Set[NCNlpSentenceToken] =
                n.getOrElse(refIdxName, Collections.emptyList).asInstanceOf[JList[JList[Int]]].asScala.
                    flatMap(_.asScala.map(this(_))).toSet

            def referencesEqualOrSimilar0(n1: NCNlpSentenceNote, n2: NCNlpSentenceNote): Boolean = {
                require(n1.noteType == n2.noteType)

                n1.noteType match {
                    case "nlpcraft:sort" =>
                        tokensEqualOrSimilar(getListList(n1, "subjindexes"), getListList(n2, "subjindexes")) &&
                        tokensEqualOrSimilar(getListList(n1, "byindexes"), getListList(n2, "byindexes"))
                    // Both note types keep their references under the same "indexes" key.
                    case "nlpcraft:limit" | "nlpcraft:reference" =>
                        tokensEqualOrSimilar(getList(n1, "indexes"), getList(n2, "indexes"))

                    // Other note types have no references compared here.
                    case _ => true
                }
            }

            def referencesEqualOrSimilar(n1: NCNlpSentenceNote, n2: NCNlpSentenceNote): Boolean =
                referencesEqualOrSimilar0(n1, n2) || referencesEqualOrSimilar0(n2, n1)

            // Note key taken without indexes and without references.
            def getUniqueKey0(n: NCNlpSentenceNote): Seq[Any] = n.getKey(withIndexes = false, withReferences = false)

            getUniqueKey0(n1) == getUniqueKey0(n2) && wordsEqualOrSimilar(n1, n2) && referencesEqualOrSimilar(n1, n2)
        }

    /**
      * Stores NLP note of each token of this sentence, keyed by the token's start and end
      * character indexes. Overwrites any previously stored initial NLP notes.
      */
    def saveNlpNotes(): Unit =
        initNlpNotes = this.map(t => NoteKey(t.startCharIndex, t.endCharIndex) -> t.getNlpNote).toMap

    /**
      * Gets initial NLP note stored for given character range.
      *
      * @param startCharIndex Start character index.
      * @param endCharIndex End character index.
      * @return Stored note, or `None` if there is no note for this range or if initial
      *     NLP notes have not been stored yet (see [[saveNlpNotes]]).
      */
    def getInitialNlpNote(startCharIndex: Int, endCharIndex: Int): Option[NCNlpSentenceNote] =
        // `initNlpNotes` is `null` by default - guard against NPE.
        Option(initNlpNotes).flatMap(_.get(NoteKey(startCharIndex, endCharIndex)))

    /**
      * Registers given token under a key built from each of its notes' type and the token's
      * start and end character indexes.
      *
      * @param nlp Token to register. Must hold at most 2 notes.
      */
    def addNlpToken(nlp: NCNlpSentenceToken): Unit = {
        require(nlp.size <= 2)

        nlp.foreach(n => nlpTokens += TokenKey(n.noteType, nlp.startCharIndex, nlp.endCharIndex) -> nlp)
    }

    /**
      * Gets token previously registered via [[addNlpToken]].
      *
      * @param noteType Note type.
      * @param startCharIndex Start character index.
      * @param endCharIndex End character index.
      * @return Registered token, if any.
      */
    def getNlpToken(noteType: String, startCharIndex: Int, endCharIndex: Int): Option[NCNlpSentenceToken] =
        nlpTokens.get(TokenKey(noteType, startCharIndex, endCharIndex))

    /**
      * Gets immutable snapshot of deleted notes with their tokens.
      */
    def getDeletedNotes: Predef.Map[NCNlpSentenceNote, Seq[NCNlpSentenceToken]] = deletedNotes.toMap

    /**
      * Adds given deleted notes (with their tokens) to the ones already held by this sentence.
      *
      * @param deletedNotes Deleted notes with their tokens.
      */
    def addDeletedNotes(deletedNotes: Map[NCNlpSentenceNote, Seq[NCNlpSentenceToken]]): Unit =
        this.deletedNotes ++= deletedNotes
}