/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model
import io.opencensus.trace.Span
import org.apache.nlpcraft.common._
import org.apache.nlpcraft.common.nlp.NCNlpSentence.NoteLink
import org.apache.nlpcraft.common.nlp.{NCNlpSentence => Sentence, NCNlpSentenceNote => NlpNote, NCNlpSentenceToken => NlpToken}
import org.apache.nlpcraft.model._
import org.apache.nlpcraft.model.impl.NCTokenImpl
import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.NCSynonymChunkKind
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.impl.NCRequestImpl
import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager
import org.apache.nlpcraft.probe.mgrs.synonyms.NCSynonymsManager
import org.apache.nlpcraft.probe.mgrs.{NCProbeIdlToken => IdlToken, NCProbeModel, NCProbeVariants, NCTokenPartKey, NCProbeSynonym => Synonym}
import java.io.Serializable
import java.util.{List => JList}
import scala.collection.mutable
import scala.collection.parallel.CollectionConverters._
import scala.jdk.CollectionConverters.{ListHasAsScala, MapHasAsJava, MapHasAsScala, SeqHasAsJava}
/**
* Model elements enricher.
*/
object NCModelEnricher extends NCProbeEnricher {
type TokType = (NCToken, NCSynonymChunkKind)
object IdlTokensSeq {
def apply(all: Seq[IdlToken]): IdlTokensSeq = IdlTokensSeq(all.filter(_.isToken), all.flatMap(_.wordIndexes).toSet)
}
case class IdlTokensSeq(tokens: Seq[IdlToken], wordsIndexes: Set[Int]) {
private val (idxsSet: Set[Int], minIndex: Int, maxIndex: Int) = {
val seq = tokens.flatMap(_.wordIndexes).distinct.sorted
(seq.toSet, seq.head, seq.last)
}
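// Cheap min/max range check used as a pre-filter before the exact index set intersection.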
def isIntersect(minIndex: Int, maxIndex: Int, idxsSet: Set[Int]): Boolean =
if (this.minIndex > maxIndex || this.maxIndex < minIndex)
false
else
this.idxsSet.exists(idxsSet.contains)
override def toString: String = tokens.mkString(" | ")
}
case class IdlTokensHolder(tokens: Seq[IdlToken], seqs: Seq[IdlTokensSeq])
/**
* Starts this enricher.
*
* @param parent Optional parent span.
* @return This service instance.
*/
override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ =>
ackStarting()
ackStarted()
}
/**
* Stops this enricher.
*
* @param parent Optional parent span.
*/
override def stop(parent: Span = null): Unit = startScopedSpan("stop", parent) { _ =>
ackStopping()
ackStopped()
}
def isComplex(mdl: NCProbeModel): Boolean = mdl.hasIdlSynonyms || !mdl.model.getParsers.isEmpty
/**
* Marks the given tokens with a note for the detected element.
*
* @param ns Sentence.
* @param elemId Element ID.
* @param toks Tokens to mark.
* @param direct Direct or permuted match flag.
* @param syn Optional matched synonym.
* @param parts Parts of the composite token, if any.
* @param metaOpt Optional element metadata.
*/
private def mark(
ns: Sentence,
elemId: String,
toks: Seq[NlpToken],
direct: Boolean,
syn: Option[Synonym] = None,
parts: Seq[TokType] = Seq.empty,
metaOpt: Option[Map[String, Object]] = None
): Unit = {
val params = mutable.ArrayBuffer.empty[(String, AnyRef)]
// For system elements.
params += "direct" -> direct.asInstanceOf[AnyRef]
syn match {
case Some(s) =>
if (s.isValueSynonym)
params += "value" -> s.value
case None => // No-op.
}
metaOpt match {
case Some(meta) => params += "meta" -> meta
case None => // No-op.
}
if (parts.nonEmpty)
params += "parts" -> parts.map { case (p, kind) => NCTokenPartKey(p, kind) }.asJava
val idxs = toks.map(_.index).sorted
val note = NlpNote(idxs, elemId, params: _*)
toks.foreach(_.add(note))
// For NLP elements.
toks.foreach(t => ns.fixNote(t.getNlpNote, "direct" -> direct))
}
/**
* Processes custom model parsers, if any.
*
* @param mdl Probe model.
* @param ns Sentence.
* @param span Parent span.
* @param req Request descriptor.
*/
private def processParsers(mdl: NCProbeModel, ns: Sentence, span: Span, req: NCRequestImpl): Unit = {
for (parser <- mdl.model.getParsers.asScala) {
parser.onInit()
startScopedSpan("customParser", span,
"srvReqId" -> ns.srvReqId,
"mdlId" -> mdl.model.getId,
"txt" -> ns.text
) { _ =>
def to(t: NlpToken): NCCustomWord =
new NCCustomWord {
override def getNormalizedText: String = t.normText
override def getOriginalText: String = t.origText
override def getStartCharIndex: Int = t.startCharIndex
override def getEndCharIndex: Int = t.endCharIndex
override def getPos: String = t.pos
override def getPosDescription: String = t.posDesc
override def getLemma: String = t.lemma
override def getStem: String = t.stem
override def isStopWord: Boolean = t.isStopWord
override def isBracketed: Boolean = t.isBracketed
override def isQuoted: Boolean = t.isQuoted
override def isKnownWord: Boolean = t.isKnownWord
override def isSwearWord: Boolean = t.isSwearWord
override def isEnglish: Boolean = t.isEnglish
}
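// Invokes the custom parser with the sentence words and all already detected non-NLP elements.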
val res = parser.parse(
req,
mdl.model,
ns.map(to).asJava,
ns.flatten.distinct.filter(!_.isNlp).map(n => {
val noteId = n.noteType
val words = ns.filter(t => n.tokenIndexes.contains(t.index)).map(to).asJava
val md = n.asMetadata()
new NCCustomElement() {
override def getElementId: String = noteId
override def getWords: JList[NCCustomWord] = words
override def getMetadata: JavaMeta = md.map(p => p._1 -> p._2.asInstanceOf[AnyRef]).asJava
}
}).asJava
)
if (res != null)
res.asScala.foreach(e => {
val elmId = e.getElementId
val words = e.getWords
if (elmId == null)
throw new NCE(s"Custom model parser cannot return 'null' element ID.")
if (words == null || words.isEmpty)
throw new NCE(s"Custom model parser cannot return empty custom tokens for element: $elmId")
val matchedToks = words.asScala.map(w =>
ns.find(t =>
t.startCharIndex == w.getStartCharIndex && t.endCharIndex == w.getEndCharIndex
).getOrElse(throw new AssertionError(s"Custom model parser returned an invalid custom token: $w"))
)
// Checks element's tokens.
if (!alreadyMarked(ns, elmId, matchedToks, matchedToks.map(_.index).sorted))
mark(
ns,
elemId = elmId,
toks = matchedToks,
direct = true,
metaOpt = Some(e.getMetadata.asScala.toMap)
)
})
}
parser.onDiscard()
}
}
/**
*
* 1. Prepares token combinations (sliding windows of decreasing size).
* Example: 'A B C' -> {'A B C', 'A B', 'B C', 'A', 'B', 'C'}
* One 3-word sentence is converted into 6 pieces.
*
* 2. Additionally, each piece is expanded into a set of variants with all possible
* permutations of its inner stopwords removed.
* Example: the piece 'x1, x2(stopword), x3(stopword), x4' is expanded into
* {'x1, x2, x3, x4', 'x1, x2, x4', 'x1, x3, x4', 'x1, x4'}
*
* 3. All variants are collected and duplicates are dropped.
*
* @param toks Sentence tokens.
*/
private def combosTokens(toks: Seq[NlpToken]): Seq[(Seq[NlpToken], Seq[NlpToken])] =
combos(toks).flatMap(combo => {
val stops = combo.filter(s => s.isStopWord && s != combo.head && s != combo.last)
val slides = mutable.ArrayBuffer.empty[mutable.ArrayBuffer[NlpToken]]
for (stop <- stops)
if (slides.nonEmpty && slides.last.last.index + 1 == stop.index)
slides.last += stop
else
slides += mutable.ArrayBuffer(stop)
// Runs of more than two consecutive stopwords are handled as whole units (not permuted individually).
val bigSlides = slides.filter(_.size > 2)
var stops4Delete: Seq[Seq[NlpToken]] =
if (bigSlides.nonEmpty) {
val allBig = bigSlides.flatten
val stops4AllCombs = stops.filter(p => !allBig.contains(p))
if (stops4AllCombs.nonEmpty)
for (
seq1 <- Range.inclusive(0, stops4AllCombs.size).flatMap(stops4AllCombs.combinations);
seq2 <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations)
)
yield seq1 ++ seq2.flatten
else
for (seq <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations))
yield seq.toSeq.flatten
}
else
Range.inclusive(1, stops.size).flatMap(stops.combinations)
stops4Delete = stops4Delete.filter(seq => !seq.contains(combo.head) && !seq.contains(combo.last))
(Seq(combo) ++ stops4Delete.map(del => combo.filter(t => !del.contains(t)))).map(_ -> combo).distinct
}).
filter(_._1.nonEmpty).
groupBy(_._1).
map(p => p._1 -> p._2.map(_._2).minBy(p => (-p.size, p.head.index))).
toSeq.
sortBy { case (data, combo) => (-combo.size, -data.size, combo.head.index, data.head.index) }
/**
* Produces all sliding windows over the given tokens, from the full sequence down to single elements.
*
* @param toks Tokens.
*/
private def combos[T](toks: Seq[T]): Seq[Seq[T]] =
(for (n <- toks.size until 0 by -1) yield toks.sliding(n)).flatten
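// Example: combos(Seq("A", "B", "C")) ==
// Seq(Seq("A", "B", "C"), Seq("A", "B"), Seq("B", "C"), Seq("A"), Seq("B"), Seq("C"))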
/**
* Converts matched IDL tokens into parts of the composite token.
*
* @param mdl Probe model.
* @param srvReqId Server request ID.
* @param seq Matched IDL tokens.
* @param s Matched synonym.
*/
private def toParts(mdl: NCProbeModel, srvReqId: String, seq: Seq[IdlToken], s: Synonym): Seq[TokType] =
seq.zip(s.map(_.kind)).map {
case (idlTok, kind) =>
val t = if (idlTok.isToken) idlTok.token else mkNlpToken(mdl, srvReqId, idlTok.word)
t -> kind
}
/**
* Converts a mix of IDL tokens and plain words into sentence tokens.
*
* @param tows Tokens or words.
* @param ns Sentence.
*/
private def toTokens(tows: Seq[IdlToken], ns: Sentence): Seq[NlpToken] =
(
tows.filter(_.isWord).map(_.word) ++
tows.filter(_.isToken).map(_.token).
flatMap(w => ns.filter(t => t.wordIndexes.intersect(w.wordIndexes).nonEmpty))
).sortBy(_.startCharIndex)
/**
* Gets synonyms for the given element ID, or an empty sequence if none.
*
* @param m Synonyms map.
* @param id Element ID.
*/
private def get(m: Map[String, Seq[Synonym]], id: String): Seq[Synonym] = m.getOrElse(id, Seq.empty)
/**
* Gets synonyms sorted in descending order by their weight (already prepared),
* i.e. first synonym in the sequence is the most important one.
*
* @param fastMap Map of {element ID -> {synonym length -> T}}.
* @param elmId Element ID.
* @param len Synonym length (in tokens).
*/
private def fastAccess[T](fastMap: Map[String, Map[Int, T]], elmId: String, len: Int): Option[T] =
fastMap.getOrElse(elmId, Map.empty[Int, T]).get(len)
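// Usage example (hypothetical element ID): fastAccess(mdl.continuousSynonyms, "myElem", 2)
// looks up the holder of 2-token continuous synonyms of element 'myElem', if any.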
/**
* Prepares the IDL tokens holder for the given sentence.
*
* @param mdl Probe model.
* @param ns Sentence.
*/
private def mkHolder(mdl: NCProbeModel, ns: Sentence): IdlTokensHolder = {
val toks = ns.map(IdlToken(_))
val seqs =
NCProbeVariants.convert(ns.srvReqId, mdl, NCSentenceManager.collapse(mdl.model, ns.clone())).
map(_.asScala).
par.
flatMap(sen =>
// Tokens splitting.
// For example, the sentence "A B C D E" (5 words) is processed as 3 tokens on the first phase after collapsing:
// 'A B' (2 words), 'C D' (2 words) and 'E' (1 word).
// So the resulting combinations will be:
// Token(AB) + Token(CD) + Token(E)
// Token(AB) + Word(C) + Word(D) + Token(E)
// Word(A) + Word(B) + Token(CD) + Token(E)
// Word(A) + Word(B) + Word(C) + Word(D) + Token(E)
combos(sen).
map(senPartComb => {
sen.flatMap(t =>
// Single-word tokens and tokens from the current combination are kept as tokens.
// All other tokens are split into their constituent words.
if (t.wordIndexes.length == 1 || senPartComb.contains(t))
Seq(IdlToken(t))
else
t.wordIndexes.map(toks)
)
// Drops sequences without any tokens (IDL matching requires tokens).
}).filter(_.exists(_.isToken)).map(IdlTokensSeq(_)).distinct
).seq
IdlTokensHolder(toks, seqs)
}
/**
* Wraps a single NLP token into a model token carrying only its NLP note.
*
* @param mdl Probe model.
* @param srvReqId Server request ID.
* @param t NLP token.
*/
private def mkNlpToken(mdl: NCProbeModel, srvReqId: String, t: NlpToken): NCToken = {
val notes = mutable.HashSet.empty[NlpNote]
notes += t.getNlpNote
NCTokenImpl(mdl, srvReqId, NlpToken(t.index, notes, t.stopsReasons))
}
/**
* Makes IDL token combinations for the given tokens slice.
*
* @param h IDL tokens holder.
* @param toks Tokens slice.
*/
private def mkCombinations(h: IdlTokensHolder, toks: Seq[NlpToken]): Seq[Seq[IdlToken]] = {
val idxs = toks.flatMap(_.wordIndexes).toSet
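// For each pre-computed variant sequence: keeps its tokens that overlap the given slice
// and fills the remaining overlapping word indexes with plain word tokens.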
h.seqs.par.
flatMap(seq => {
val rec = seq.tokens.filter(_.wordIndexes.exists(idxs.contains))
// Drops sequences without any tokens (IDL matching requires tokens).
if (rec.nonEmpty)
Some(rec ++
(seq.wordsIndexes.intersect(idxs) -- rec.flatMap(_.wordIndexes)).map(h.tokens)
)
else
None
}).seq
}
/**
* Adds stopwords lying strictly inside the matched range back to the matched tokens.
*
* @param matched Matched tokens.
* @param toks2Match Tokens the matching was performed on.
*/
private def getSparsedTokens(matched: Seq[NlpToken], toks2Match: Seq[NlpToken]): Seq[NlpToken] = {
require(matched.nonEmpty)
// Matched tokens should be already sorted.
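// Example: if tokens at indexes 2 and 5 matched and tokens 3 and 4 are stopwords,
// the result is the continuous token range 2, 3, 4, 5.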
val stopsInside = toks2Match.filter(t =>
t.isStopWord && !matched.contains(t) && t.index > matched.head.index && t.index < matched.last.index
)
if (stopsInside.nonEmpty) (matched ++ stopsInside).sortBy(_.index) else matched
}
@throws[NCE]
override def enrich(mdl: NCProbeModel, ns: Sentence, senMeta: Map[String, Serializable], parent: Span = null): Unit = {
require(isStarted)
startScopedSpan(
"enrich", parent, "srvReqId" -> ns.srvReqId, "mdlId" -> mdl.model.getId, "txt" -> ns.text
) { span =>
val req = NCRequestImpl(senMeta, ns.srvReqId)
lazy val ch = mkHolder(mdl, ns)
lazy val variantsToks =
ch.seqs.map(
p => p.tokens.map(p => if (p.isToken) p.token else mkNlpToken(mdl, ns.srvReqId, p.word))
)
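// Note: 'ch' and 'variantsToks' are computed lazily, only when IDL processing actually needs them.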
def execute(simpleEnabled: Boolean, idlEnabled: Boolean): Unit =
startScopedSpan(
"execute", span, "srvReqId" -> ns.srvReqId, "mdlId" -> mdl.model.getId, "txt" -> ns.text
) { _ =>
if (DEEP_DEBUG)
logger.trace(s"Execution started [simpleEnabled=$simpleEnabled, idlEnabled=$idlEnabled]")
for (
// 'toksExt' is a piece of the sentence; 'toks' is the same piece or one with some of its stopwords removed.
(toks, toksExt) <- combosTokens(ns.toSeq);
idxs = toks.map(_.index);
e <- mdl.elements.values;
elemId = e.getId;
greedy = e.isGreedy.orElse(mdl.model.isGreedy)
if !greedy || !alreadyMarked(ns, elemId, toks, idxs)
) {
def add(
dbgType: String,
elemToks: Seq[NlpToken],
syn: Synonym,
parts: Seq[TokType] = Seq.empty
): Unit = {
val resIdxs = elemToks.map(_.index)
val ok =
(!greedy || !alreadyMarked(ns, elemId, elemToks, idxs)) &&
(parts.isEmpty || !parts.exists { case (t, _) => t.getId == elemId })
if (ok)
mark(
ns,
elemId,
elemToks,
direct = syn.isDirect && U.isIncreased(resIdxs),
syn = Some(syn),
parts = parts
)
if (DEEP_DEBUG)
logger.trace(
s"${if (ok) "Added" else "Skipped"} element [" +
s"id=$elemId, " +
s"type=$dbgType, " +
s"text='${elemToks.map(_.origText).mkString(" ")}', " +
s"indexes=${resIdxs.mkString("[", ",", "]")}, " +
s"allTokensIndexes=${idxs.mkString("[", ",", "]")}, " +
s"synonym=$syn" +
s"]"
)
}
// 1. SIMPLE.
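// Note: when IDL is enabled this pass covers only elements that have IDL synonyms;
// otherwise it covers only elements without them.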
if (simpleEnabled && (if (idlEnabled) mdl.hasIdlSynonyms(elemId) else !mdl.hasIdlSynonyms(elemId))) {
lazy val tokStems = toks.map(_.stem).mkString(" ")
// 1.1 Continuous.
var found = false
if (mdl.hasContinuousSynonyms)
fastAccess(mdl.continuousSynonyms, elemId, toks.length) match {
case Some(h) =>
def tryMap(syns: Map[String, Synonym], notFound: () => Unit): Unit =
syns.get(tokStems) match {
case Some(s) =>
found = true
add("simple continuous", toksExt, s)
case None => notFound()
}
def tryScan(syns: Seq[Synonym]): Unit =
for (syn <- syns if !found)
NCSynonymsManager.onMatch(
ns.srvReqId,
elemId,
syn,
toks,
_ => {
found = true
add("simple continuous scan", toksExt, syn)
}
)
tryMap(
h.txtDirectSynonyms,
() => {
tryScan(h.notTxtDirectSynonyms)
if (!found)
tryMap(h.txtNotDirectSynonyms, () => tryScan(h.notTxtNotDirectSynonyms))
}
)
case None => // No-op.
}
// 1.2 Sparse.
if (!found && mdl.hasSparseSynonyms)
for (syn <- get(mdl.sparseSynonyms, elemId))
NCSynonymsManager.onSparseMatch(
ns.srvReqId,
elemId,
syn,
toks,
res => add("simple sparse", getSparsedTokens(res, toks), syn)
)
}
// 2. IDL.
if (idlEnabled) {
val allSyns = get(mdl.idlSynonyms, elemId)
lazy val allCombs = mkCombinations(ch, toks)
// 2.1 Continuous.
if (!mdl.hasSparseSynonyms) {
var found = false
for (syn <- allSyns; comb <- allCombs; if !found)
NCSynonymsManager.onMatch(
ns.srvReqId,
elemId,
syn,
comb,
req,
variantsToks,
_ => {
val parts = toParts(mdl, ns.srvReqId, comb, syn)
add("IDL continuous", toksExt, syn, parts)
found = true
}
)
}
else
// 2.2 Sparse.
for (syn <- allSyns; comb <- allCombs)
NCSynonymsManager.onSparseMatch(
ns.srvReqId,
elemId,
syn,
comb,
req,
variantsToks,
res => {
val toks = getSparsedTokens(toTokens(res, ns), toTokens(comb, ns))
val parts = toParts(mdl, ns.srvReqId, res, syn)
val typ = if (syn.sparse) "IDL sparse" else "IDL continuous"
add(typ, toks, syn, parts)
}
)
}
}
}
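// The first probe phase runs both simple and IDL processing as the model requires;
// subsequent phases re-run IDL processing only.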
if (ns.firstProbePhase) {
ns.firstProbePhase = false
if (mdl.hasNoIdlSynonyms)
execute(simpleEnabled = true, idlEnabled = false)
execute(simpleEnabled = mdl.hasNoIdlSynonyms, idlEnabled = mdl.hasIdlSynonyms)
}
else if (mdl.hasIdlSynonyms)
execute(simpleEnabled = false, idlEnabled = true)
processParsers(mdl, ns, span, req)
}
NCSynonymsManager.clearIteration(ns.srvReqId)
normalize(ns)
}
/**
* Removes user notes that duplicate another note of the same type and parts while covering
* the same words plus extra stopwords only.
*
* @param ns Sentence.
*/
private def normalize(ns: Sentence): Unit = {
val usrNotes = ns.flatten.filter(_.isUser).distinct
val links = NCSentenceManager.getLinks(usrNotes)
val parts = NCSentenceManager.getPartKeys(usrNotes)
val usrNotesIdxs = usrNotes.
filter(n => !links.contains(NoteLink(n.noteType, n.tokenIndexes.sorted))).
filter(n => !parts.contains(NCTokenPartKey(n, ns))).
zipWithIndex
usrNotesIdxs.
foreach { case (n, idx) =>
usrNotesIdxs.find { case (candidate, candidateIdx) =>
candidateIdx != idx &&
candidate.noteType == n.noteType &&
candidate.dataOpt("parts") == n.dataOpt("parts") &&
candidate.wordIndexesSet.subsetOf(n.wordIndexesSet) &&
n.wordIndexes.filter(wordIdx => !candidate.wordIndexes.contains(wordIdx)).
forall(wordIdx => ns.tokens.exists(t => t.wordIndexes.contains(wordIdx) && t.isStopWord))
} match {
case Some(better) =>
ns.removeNote(n)
logger.trace(s"Element removed: $n, better: $better")
case None => // No-op.
}
}
}
// TODO: simplify, add tests, check model properties (sparse etc.) for optimization.
/**
* Checks whether the given element is already marked on the given tokens.
*
* @param ns Sentence.
* @param elmId Element ID.
* @param toks Tokens to check.
* @param sliceToksIdxsSorted Sorted token indexes of the whole checked slice.
*/
private def alreadyMarked(ns: Sentence, elmId: String, toks: Seq[NlpToken], sliceToksIdxsSorted: Seq[Int]): Boolean = {
lazy val toksIdxsSorted = toks.map(_.index).sorted
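// The element is considered already marked if either (a) every token of the given slice
// already carries a continuous (sparsity == 0) note of this element, or (b) some token
// carries a note of this element whose indexes equal or cover the checked indexes.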
sliceToksIdxsSorted.map(ns).forall(_.exists(n => n.noteType == elmId && n.sparsity == 0)) ||
toks.exists(_.exists(n =>
n.noteType == elmId &&
(
(n.sparsity == 0 &&
(sliceToksIdxsSorted.containsSlice(n.tokenIndexes) || n.tokenIndexes.containsSlice(toksIdxsSorted))
)
||
(
n.tokenIndexes == toksIdxsSorted ||
n.tokenIndexes.containsSlice(toksIdxsSorted) &&
U.isContinuous(toksIdxsSorted) &&
U.isContinuous(n.tokenIndexes)
)
)
))
}
}