| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.nlpcraft.probe.mgrs.sentence |
| |
| import io.opencensus.trace.Span |
| import org.apache.nlpcraft.common.nlp.NCNlpSentence.NoteLink |
| import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank |
| import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSentenceToken} |
| import org.apache.nlpcraft.common.{NCE, NCService, U} |
| import org.apache.nlpcraft.model.NCModel |
| |
| import java.io.{Serializable ⇒ JSerializable} |
| import java.util |
| import java.util.{List ⇒ JList} |
| import scala.collection.JavaConverters.{asScalaBufferConverter, _} |
| import scala.collection.{Map, Seq, mutable} |
| import scala.language.implicitConversions |
| |
| /** |
| * Sentences processing manager. |
| */ |
| object NCSentenceManager extends NCService { |
| @volatile private var pool: java.util.concurrent.ForkJoinPool = _ |
| |
| case class PartKey(id: String, start: Int, end: Int) { |
| require(start <= end) |
| |
| private def in(i: Int): Boolean = i >= start && i <= end |
| def intersect(id: String, start: Int, end: Int): Boolean = id == this.id && (in(start) || in(end)) |
| } |
| |
| object PartKey { |
| def apply(m: util.HashMap[String, JSerializable]): PartKey = { |
| def get[T](name: String): T = m.get(name).asInstanceOf[T] |
| |
| PartKey(get("id"), get("startcharindex"), get("endcharindex")) |
| } |
| |
| def apply(t: NCNlpSentenceNote, sen: NCNlpSentence): PartKey = |
| PartKey(t.noteType, sen(t.tokenFrom).startCharIndex, sen(t.tokenTo).endCharIndex) |
| } |
| |
| /** |
| * |
| * @param notes |
| */ |
| private def getLinks(notes: Seq[NCNlpSentenceNote]): Seq[NoteLink] = { |
| val noteLinks = mutable.ArrayBuffer.empty[NoteLink] |
| |
| for (n ← notes.filter(n ⇒ n.noteType == "nlpcraft:limit" || n.noteType == "nlpcraft:references")) |
| noteLinks += NoteLink(n("note").asInstanceOf[String], n("indexes").asInstanceOf[JList[Int]].asScala.sorted) |
| |
| for (n ← notes.filter(_.noteType == "nlpcraft:sort")) { |
| def add(noteName: String, idxsName: String): Unit = { |
| val names = n(noteName).asInstanceOf[JList[String]] |
| val idxsSeq = n(idxsName).asInstanceOf[JList[JList[Int]]] |
| |
| require(names.size() == idxsSeq.size()) |
| |
| noteLinks ++= |
| (for ((name, idxs) ← names.asScala.zip(idxsSeq.asScala.map(_.asScala))) |
| yield NoteLink(name, idxs.sorted) |
| ) |
| } |
| |
| if (n.contains("subjnotes")) add("subjnotes", "subjindexes") |
| if (n.contains("bynotes")) add("bynotes", "byindexes") |
| } |
| |
| noteLinks |
| } |
| |
| /** |
| * |
| * @param notes |
| */ |
| private def getPartKeys(notes: NCNlpSentenceNote*): Seq[PartKey] = |
| notes. |
| filter(_.isUser). |
| flatMap(n ⇒ { |
| val optList: Option[JList[util.HashMap[String, JSerializable]]] = n.dataOpt("parts") |
| |
| optList |
| }).flatMap(_.asScala).map(m ⇒ PartKey(m)).distinct |
| |
| /** |
| * |
| * @param ns |
| * @param idxs |
| * @param notesType |
| * @param note |
| * @return |
| */ |
| private def checkRelation(ns: NCNlpSentence, idxs: Seq[Int], notesType: String, note: NCNlpSentenceNote): Boolean = { |
| val types = idxs.flatMap(idx ⇒ ns(idx).map(p ⇒ p).filter(!_.isNlp).map(_.noteType)).distinct |
| |
| /** |
| * Example: |
| * 1. Sentence 'maximum x' (single element related function) |
| * - maximum is aggregate function linked to date element. |
| * - x defined as 2 elements: date and num. |
| * So, the variant 'maximum x (as num)' should be excluded. |
| * * |
| * 2. Sentence 'compare x and y' (multiple elements related function) |
| * - compare is relation function linked to date element. |
| * - x an y defined as 2 elements: date and num. |
| * So, variants 'x (as num) and x (as date)' and 'x (as date) and x (as num)' |
| * should not be excluded, but invalid relation should be deleted for these combinations. |
| */ |
| types.size match { |
| case 0 ⇒ false |
| case 1 ⇒ types.head == notesType |
| case _ ⇒ |
| // Equal elements should be processed together with function element. |
| if (types.size == 1) |
| false |
| else { |
| ns.removeNote(note) |
| |
| logger.trace(s"Removed note: $note") |
| |
| true |
| } |
| } |
| } |
| |
| /** |
| * Fixes notes with references to other notes indexes. |
| * Note that 'idxsField' is 'indexes' and 'noteField' is 'note' for all kind of references. |
| * |
| * @param noteType Note type. |
| * @param idxsField Indexes field. |
| * @param noteField Note field. |
| * @param ns Sentence. |
| * @param history Indexes transformation history. |
| * @return Valid flag. |
| */ |
| private def fixIndexesReferences( |
| noteType: String, |
| idxsField: String, |
| noteField: String, |
| ns: NCNlpSentence, |
| history: Seq[(Int, Int)] |
| ): Boolean = { |
| ns.filter(_.isTypeOf(noteType)).foreach(tok ⇒ |
| tok.getNoteOpt(noteType, idxsField) match { |
| case Some(n) ⇒ |
| val idxs: Seq[Int] = n.data[JList[Int]](idxsField).asScala |
| var fixed = idxs |
| |
| history.foreach { case (idxOld, idxNew) ⇒ fixed = fixed.map(i ⇒ if (i == idxOld) idxNew else i) } |
| |
| fixed = fixed.distinct |
| |
| if (idxs != fixed) |
| ns.fixNote(n, "indexes" → fixed.asJava.asInstanceOf[JSerializable]) |
| case None ⇒ // No-op. |
| } |
| ) |
| |
| ns.flatMap(_.getNotes(noteType)).forall( |
| n ⇒ checkRelation(ns, n.data[JList[Int]]("indexes").asScala, n.data[String](noteField), n) |
| ) |
| } |
| |
| /** |
| * |
| * @param note |
| * @param idxsField |
| * @param noteField |
| * @param ns |
| */ |
| private def fixNoteIndexes(note: String, idxsField: String, noteField: String, ns: NCNlpSentence): Unit = |
| ns.flatMap(_.getNotes(note)).foreach( |
| n ⇒ checkRelation(ns, n.data[JList[Int]](idxsField).asScala, n.data[String](noteField), n) |
| ) |
| |
| /** |
| * |
| * @param note |
| * @param idxsField |
| * @param noteField |
| * @param ns |
| */ |
| private def fixNoteIndexesList(note: String, idxsField: String, noteField: String, ns: NCNlpSentence): Unit = { |
| ns.flatMap(_.getNotes(note)).foreach(rel ⇒ |
| rel.dataOpt[JList[JList[Int]]](idxsField) match { |
| case Some(idxsList) ⇒ |
| val notesTypes = rel.data[JList[String]](noteField) |
| |
| require(idxsList.size() == notesTypes.size()) |
| |
| idxsList.asScala.zip(notesTypes.asScala).foreach { |
| case (idxs, notesType) ⇒ checkRelation(ns, idxs.asScala, notesType, rel) |
| } |
| case None ⇒ // No-op. |
| } |
| ) |
| } |
| |
| /** |
| * Copies token. |
| * |
| * @param ns Sentence. |
| * @param history Indexes transformation history. |
| * @param toksCopy Copied tokens. |
| * @param i Index. |
| */ |
| private def simpleCopy( |
| ns: NCNlpSentence, |
| history: mutable.ArrayBuffer[(Int, Int)], |
| toksCopy: NCNlpSentence, i: Int |
| ): Seq[NCNlpSentenceToken] = { |
| val tokCopy = toksCopy(i) |
| |
| history += tokCopy.index → ns.size |
| |
| ns += tokCopy.clone(ns.size) |
| } |
| |
| /** |
| * Glues stop words. |
| * |
| * @param ns Sentence. |
| * @param userNoteTypes Notes types. |
| * @param history Indexes transformation history. |
| */ |
| private def unionStops( |
| ns: NCNlpSentence, |
| userNoteTypes: Seq[String], |
| history: mutable.ArrayBuffer[(Int, Int)] |
| ): Unit = { |
| // Java collection used because using scala collections (mutable.Buffer.empty[mutable.Buffer[Token]]) is reason |
| // Of compilation errors which seems as scala compiler internal error. |
| val bufs = new util.ArrayList[mutable.Buffer[NCNlpSentenceToken]]() |
| |
| def last[T](l: JList[T]): T = l.get(l.size() - 1) |
| |
| ns.filter(t ⇒ t.isStopWord && !t.isBracketed).foreach(t ⇒ |
| if (!bufs.isEmpty && last(bufs).last.index + 1 == t.index) |
| last(bufs) += t |
| else |
| bufs.add(mutable.Buffer.empty[NCNlpSentenceToken] :+ t) |
| ) |
| |
| val idxsSeq = bufs.asScala.filter(_.lengthCompare(1) > 0).map(_.map(_.index)) |
| |
| if (idxsSeq.nonEmpty) { |
| val nsCopyToks = ns.clone() |
| ns.clear() |
| |
| val buf = mutable.Buffer.empty[Int] |
| |
| for (i ← nsCopyToks.indices) |
| idxsSeq.find(_.contains(i)) match { |
| case Some(idxs) ⇒ |
| if (!buf.contains(idxs.head)) { |
| buf += idxs.head |
| |
| ns += mkCompound(ns, nsCopyToks, idxs, stop = true, ns.size, None, history) |
| } |
| case None ⇒ simpleCopy(ns, history, nsCopyToks, i) |
| } |
| |
| fixIndexes(ns, userNoteTypes) |
| } |
| } |
| |
| /** |
| * Fixes indexes for all notes after recreating tokens. |
| * |
| * @param ns Sentence. |
| * @param userNoteTypes Notes types. |
| */ |
| private def fixIndexes(ns: NCNlpSentence, userNoteTypes: Seq[String]) { |
| // Replaces other notes indexes. |
| for (t ← userNoteTypes :+ "nlpcraft:nlp"; note ← ns.getNotes(t)) { |
| val toks = ns.filter(_.contains(note)).sortBy(_.index) |
| |
| val newNote = note.clone(toks.map(_.index), toks.flatMap(_.wordIndexes).sorted) |
| |
| toks.foreach(t ⇒ { |
| t.remove(note) |
| t.add(newNote) |
| }) |
| } |
| |
| // Special case - field index of core NLP note. |
| ns.zipWithIndex.foreach { case (tok, idx) ⇒ ns.fixNote(tok.getNlpNote, "index" → idx) } |
| } |
| |
| /** |
| * Zip notes with same type. |
| * |
| * @param ns Sentence. |
| * @param nType Notes type. |
| * @param userNotesTypes Notes types. |
| * @param history Indexes transformation history. |
| */ |
| private def zipNotes( |
| ns: NCNlpSentence, |
| nType: String, |
| userNotesTypes: Seq[String], |
| history: mutable.ArrayBuffer[(Int, Int)] |
| ): Unit = { |
| val nts = ns.getNotes(nType).filter(n ⇒ n.tokenFrom != n.tokenTo).sortBy(_.tokenFrom) |
| |
| val overlapped = |
| nts.flatMap(n ⇒ n.tokenFrom to n.tokenTo).map(ns(_)).exists( |
| t ⇒ userNotesTypes.map(pt ⇒ t.getNotes(pt).size).sum > 1 |
| ) |
| |
| if (nts.nonEmpty && !overlapped) { |
| val nsCopyToks = ns.clone() |
| ns.clear() |
| |
| val buf = mutable.ArrayBuffer.empty[Int] |
| |
| for (i ← nsCopyToks.indices) |
| nts.find(_.tokenIndexes.contains(i)) match { |
| case Some(n) ⇒ |
| if (!buf.contains(n.tokenFrom)) { |
| buf += n.tokenFrom |
| |
| ns += mkCompound(ns, nsCopyToks, n.tokenIndexes, stop = false, ns.size, Some(n), history) |
| } |
| case None ⇒ simpleCopy(ns, history, nsCopyToks, i) |
| } |
| |
| fixIndexes(ns, userNotesTypes) |
| } |
| } |
| |
| /** |
| * Makes compound note. |
| * |
| * @param ns Sentence. |
| * @param nsCopyToks Tokens. |
| * @param indexes Indexes. |
| * @param stop Flag. |
| * @param idx Index. |
| * @param commonNote Common note. |
| * @param history Indexes transformation history. |
| */ |
| private def mkCompound( |
| ns: NCNlpSentence, |
| nsCopyToks: Seq[NCNlpSentenceToken], |
| indexes: Seq[Int], |
| stop: Boolean, |
| idx: Int, |
| commonNote: Option[NCNlpSentenceNote], |
| history: mutable.ArrayBuffer[(Int, Int)] |
| ): NCNlpSentenceToken = { |
| val t = NCNlpSentenceToken(idx) |
| |
| // Note, it adds stop-words too. |
| val content = nsCopyToks.zipWithIndex.filter(p ⇒ indexes.contains(p._2)).map(_._1) |
| |
| content.foreach(t ⇒ history += t.index → idx) |
| |
| def mkValue(get: NCNlpSentenceToken ⇒ String): String = { |
| val buf = mutable.Buffer.empty[String] |
| |
| val n = content.size - 1 |
| |
| content.zipWithIndex.foreach(p ⇒ { |
| val t = p._1 |
| val idx = p._2 |
| |
| buf += get(t) |
| |
| if (idx < n && t.endCharIndex != content(idx + 1).startCharIndex) |
| buf += " " |
| }) |
| |
| buf.mkString |
| } |
| |
| val origText = mkValue((t: NCNlpSentenceToken) ⇒ t.origText) |
| |
| val idxs = Seq(idx) |
| val wordIdxs = content.flatMap(_.wordIndexes).sorted |
| |
| val direct = |
| commonNote match { |
| case Some(n) if n.isUser ⇒ n.isDirect |
| case _ ⇒ content.forall(_.isDirect) |
| } |
| |
| val params = Seq( |
| "index" → idx, |
| "pos" → NCPennTreebank.SYNTH_POS, |
| "posDesc" → NCPennTreebank.SYNTH_POS_DESC, |
| "lemma" → mkValue((t: NCNlpSentenceToken) ⇒ t.lemma), |
| "origText" → origText, |
| "normText" → mkValue((t: NCNlpSentenceToken) ⇒ t.normText), |
| "stem" → mkValue((t: NCNlpSentenceToken) ⇒ t.stem), |
| "start" → content.head.startCharIndex, |
| "end" → content.last.endCharIndex, |
| "charLength" → origText.length, |
| "quoted" → false, |
| "stopWord" → stop, |
| "bracketed" → false, |
| "direct" → direct, |
| "dict" → (if (nsCopyToks.size == 1) nsCopyToks.head.getNlpNote.data[Boolean]("dict") else false), |
| "english" → nsCopyToks.forall(_.getNlpNote.data[Boolean]("english")), |
| "swear" → nsCopyToks.exists(_.getNlpNote.data[Boolean]("swear")) |
| ) |
| |
| val nlpNote = NCNlpSentenceNote(idxs, wordIdxs, "nlpcraft:nlp", params: _*) |
| |
| t.add(nlpNote) |
| |
| // Adds processed note with fixed indexes. |
| commonNote match { |
| case Some(n) ⇒ |
| ns.removeNote(n) |
| t.add(n.clone(idxs, wordIdxs)) |
| case None ⇒ // No-op. |
| } |
| |
| t |
| } |
| |
| /** |
| * Fixes notes with references list to other notes indexes. |
| * |
| * @param noteType Note type. |
| * @param idxsField Indexes field. |
| * @param noteField Note field. |
| * @param ns Sentence. |
| * @param history Indexes transformation history. |
| * @return Valid flag. |
| */ |
| private def fixIndexesReferencesList( |
| noteType: String, |
| idxsField: String, |
| noteField: String, |
| ns: NCNlpSentence, |
| history: Seq[(Int, Int)] |
| ): Boolean = { |
| var ok = true |
| |
| for (tok ← ns.filter(_.isTypeOf(noteType)) if ok) |
| tok.getNoteOpt(noteType, idxsField) match { |
| case Some(n) ⇒ |
| val idxs: Seq[Seq[Int]] = |
| n.data[JList[JList[Int]]](idxsField).asScala.map(_.asScala) |
| var fixed = idxs |
| |
| history.foreach { |
| case (idxOld, idxNew) ⇒ fixed = fixed.map(_.map(i ⇒ if (i == idxOld) idxNew else i).distinct) |
| } |
| |
| if (fixed.forall(_.size == 1)) |
| // Fix double dimension array to one dimension, |
| // so it should be called always in spite of 'fixIndexesReferences' method. |
| ns.fixNote(n, idxsField → fixed.map(_.head).asJava.asInstanceOf[JSerializable]) |
| else |
| ok = false |
| case None ⇒ // No-op. |
| } |
| |
| ok && |
| ns.flatMap(_.getNotes(noteType)).forall(rel ⇒ |
| rel.dataOpt[JList[Int]](idxsField) match { |
| case Some(idxsList) ⇒ |
| val notesTypes = rel.data[JList[String]](noteField) |
| |
| require(idxsList.size() == notesTypes.size()) |
| |
| idxsList.asScala.zip(notesTypes.asScala).forall { |
| case (idxs, notesType) ⇒ checkRelation(ns, Seq(idxs), notesType, rel) |
| } |
| case None ⇒ true |
| } |
| ) |
| } |
| |
| /** |
| * Fixes tokens positions. |
| * |
| * @param ns Sentence. |
| * @param notNlpTypes Token types. |
| */ |
| private def collapseSentence(ns: NCNlpSentence, notNlpTypes: Seq[String]): Boolean = { |
| ns. |
| filter(!_.isNlp). |
| filter(_.isStopWord). |
| flatten. |
| filter(_.isNlp). |
| foreach(n ⇒ ns.fixNote(n, "stopWord" → false)) |
| |
| val all = ns.tokens.flatten |
| val nsNotes: Map[String, Seq[Int]] = all.map(p ⇒ p.noteType → p.tokenIndexes).toMap |
| |
| for ( |
| t ← ns.tokens; stopReason ← t.stopsReasons |
| if all.contains(stopReason) && nsNotes.getOrElse(stopReason.noteType, Seq.empty) == stopReason.tokenIndexes |
| ) |
| ns.fixNote(t.getNlpNote, "stopWord" → true) |
| |
| val history = mutable.ArrayBuffer.empty[(Int, Int)] |
| |
| fixNoteIndexes("nlpcraft:relation", "indexes", "note", ns) |
| fixNoteIndexes("nlpcraft:limit", "indexes", "note", ns) |
| fixNoteIndexesList("nlpcraft:sort", "subjindexes", "subjnotes", ns) |
| fixNoteIndexesList("nlpcraft:sort", "byindexes", "bynotes", ns) |
| |
| notNlpTypes.foreach(typ ⇒ zipNotes(ns, typ, notNlpTypes, history)) |
| unionStops(ns, notNlpTypes, history) |
| |
| val res = |
| fixIndexesReferences("nlpcraft:relation", "indexes", "note", ns, history) && |
| fixIndexesReferences("nlpcraft:limit", "indexes", "note", ns, history) && |
| fixIndexesReferencesList("nlpcraft:sort", "subjindexes", "subjnotes", ns, history) && |
| fixIndexesReferencesList("nlpcraft:sort", "byindexes", "bynotes", ns, history) |
| |
| if (res) { |
| // Validation (all indexes calculated well) |
| require( |
| !res || |
| !ns.flatten. |
| exists(n ⇒ ns.filter(_.wordIndexes.exists(n.wordIndexes.contains)).exists(t ⇒ !t.contains(n))), |
| s"Invalid sentence:\n" + |
| ns.map(t ⇒ |
| // Human readable invalid sentence for debugging. |
| s"${t.origText}{index:${t.index}}[${t.map(n ⇒ s"${n.noteType}, {range:${n.tokenFrom}-${n.tokenTo}}").mkString("|")}]" |
| ).mkString("\n") |
| ) |
| } |
| |
| res |
| } |
| |
| /** |
| * |
| * @param mdl |
| * @param ns |
| */ |
| private def dropAbstract(mdl: NCModel, ns: NCNlpSentence): Unit = |
| if (!mdl.getAbstractTokens.isEmpty) { |
| val notes = ns.flatten |
| |
| val keys = getPartKeys(notes: _*) |
| val noteLinks = getLinks(notes) |
| |
| notes.filter(n ⇒ { |
| val noteToks = ns.tokens.filter(_.contains(n)) |
| |
| mdl.getAbstractTokens.contains(n.noteType) && |
| !keys.exists(_.intersect(n.noteType, noteToks.head.startCharIndex, noteToks.last.startCharIndex)) && |
| !noteLinks.contains(NoteLink(n.noteType, n.tokenIndexes.sorted)) |
| }).foreach(ns.removeNote) |
| } |
| |
| /** |
| * |
| * @param toks |
| * @return |
| */ |
| private def getNotNlpNotes(toks: Seq[NCNlpSentenceToken]): Seq[NCNlpSentenceNote] = |
| toks.flatten.filter(!_.isNlp).distinct |
| |
| /** |
| * |
| * @param thisSen |
| * @param sen |
| * @param dels |
| */ |
| private def addDeleted(thisSen: NCNlpSentence, sen: NCNlpSentence, dels: Iterable[NCNlpSentenceNote]): Unit = |
| sen.addDeletedNotes(dels.map(n ⇒ { |
| val savedDelNote = n.clone() |
| val savedDelToks = n.tokenIndexes.map(idx ⇒ thisSen(idx).clone()) |
| |
| val mainNotes = savedDelToks.flatten.filter(n ⇒ n.noteType != "nlpcraft:nlp" && n != savedDelNote) |
| |
| // Deleted note's tokens should contains only nlp data and deleted notes. |
| for (savedDelTok ← savedDelToks; mainNote ← mainNotes) |
| savedDelTok.remove(mainNote) |
| |
| savedDelNote → savedDelToks |
| }).toMap) |
| |
| /** |
| * This collapser handles several tasks: |
| * - "overall" collapsing after all other individual collapsers had their turn. |
| * - Special further enrichment of tokens like linking, etc. |
| * |
| * In all cases of overlap (full or partial) - the "longest" note wins. In case of overlap and equal |
| * lengths - the winning note is chosen based on this priority. |
| */ |
| @throws[NCE] |
| private def collapseSentence(sen: NCNlpSentence, mdl: NCModel, lastPhase: Boolean = false): Seq[NCNlpSentence] = { |
| def collapse0(ns: NCNlpSentence): Option[NCNlpSentence] = { |
| if (lastPhase) |
| dropAbstract(mdl, ns) |
| |
| if (collapseSentence(ns, getNotNlpNotes(ns).map(_.noteType).distinct)) Some(ns) else None |
| } |
| |
| // Always deletes `similar` notes. |
| // Some words with same note type can be detected various ways. |
| // We keep only one variant - with `best` direct and sparsity parameters, |
| // other variants for these words are redundant. |
| val redundant: Seq[NCNlpSentenceNote] = |
| sen.flatten.filter(!_.isNlp).distinct. |
| groupBy(_.getKey()). |
| map(p ⇒ p._2.sortBy(p ⇒ |
| ( |
| // System notes don't have such flags. |
| if (p.isUser) { |
| if (p.isDirect) |
| 0 |
| else |
| 1 |
| } |
| else |
| 0, |
| if (p.isUser) |
| p.sparsity |
| else |
| 0 |
| ) |
| )). |
| flatMap(_.drop(1)). |
| toSeq |
| |
| redundant.foreach(sen.removeNote) |
| |
| var delCombs: Seq[NCNlpSentenceNote] = |
| getNotNlpNotes(sen). |
| flatMap(note ⇒ getNotNlpNotes(note.tokenIndexes.sorted.map(i ⇒ sen(i))).filter(_ != note)). |
| distinct |
| |
| // Optimization. Deletes all wholly swallowed notes. |
| val links = getLinks(sen.flatten) |
| |
| val swallowed = |
| delCombs. |
| // There aren't links on it. |
| filter(n ⇒ !links.contains(NoteLink(n.noteType, n.tokenIndexes.sorted))). |
| // It doesn't have links. |
| filter(getPartKeys(_).isEmpty). |
| flatMap(note ⇒ { |
| val noteWordsIdxs = note.wordIndexes.toSet |
| val key = PartKey(note, sen) |
| |
| val delCombOthers = |
| delCombs.filter(_ != note).flatMap(n ⇒ if (getPartKeys(n).contains(key)) Some(n) else None) |
| |
| if ( |
| delCombOthers.exists(o ⇒ noteWordsIdxs == o.wordIndexes.toSet) || |
| delCombOthers.nonEmpty && !delCombOthers.exists(o ⇒ noteWordsIdxs.subsetOf(o.wordIndexes.toSet)) |
| ) |
| Some(note) |
| else |
| None |
| }) |
| |
| delCombs = delCombs.filter(p ⇒ !swallowed.contains(p)) |
| addDeleted(sen, sen, swallowed) |
| swallowed.foreach(sen.removeNote) |
| |
| var sens = |
| if (delCombs.nonEmpty) { |
| val toksByIdx = |
| delCombs.flatMap(note ⇒ note.wordIndexes.map(_ → note)). |
| groupBy { case (idx, _) ⇒ idx }. |
| map { case (_, seq) ⇒ seq.map { case (_, note) ⇒ note }.toSet }. |
| toSeq.sortBy(-_.size) |
| |
| val seqSens = |
| NCSentenceHelper.findCombinations(toksByIdx.map(_.asJava).asJava, pool).asScala.map(_.asScala). |
| par. |
| flatMap(delComb ⇒ { |
| val nsClone = sen.clone() |
| |
| // Saves deleted notes for sentence and their tokens. |
| addDeleted(sen, nsClone, delComb) |
| delComb.foreach(nsClone.removeNote) |
| |
| // Has overlapped notes for some tokens. |
| require(!nsClone.exists(_.count(!_.isNlp) > 1)) |
| |
| collapse0(nsClone) |
| }).seq |
| |
| // It removes sentences which have only one difference - 'direct' flag of their user tokens. |
| // `Direct` sentences have higher priority. |
| type Key = Seq[Map[String, JSerializable]] |
| case class Holder(key: Key, sentence: NCNlpSentence, factor: Int) |
| |
| def mkHolder(sen: NCNlpSentence): Holder = { |
| val notes = sen.flatten |
| |
| Holder( |
| // We have to delete some keys to have possibility to compare sentences. |
| notes.map(_.clone().filter { case (name, _) ⇒ name != "direct" }), |
| sen, |
| notes.filter(_.isNlp).map(p ⇒ if (p.isDirect) 0 else 1).sum |
| ) |
| } |
| |
| seqSens.par.map(mkHolder).seq.groupBy(_.key).map { case (_, seq) ⇒ seq.minBy(_.factor).sentence }.toSeq |
| } |
| else |
| collapse0(sen).flatMap(p ⇒ Option(Seq(p))).getOrElse(Seq.empty) |
| |
| sens = sens.distinct |
| |
| sens.par.foreach(sen ⇒ |
| sen.foreach(tok ⇒ |
| tok.size match { |
| case 1 ⇒ require(tok.head.isNlp, s"Unexpected non-'nlpcraft:nlp' token: $tok") |
| case 2 ⇒ require(tok.head.isNlp ^ tok.last.isNlp, s"Unexpected token notes: $tok") |
| case _ ⇒ require(requirement = false, s"Unexpected token notes count: $tok") |
| } |
| ) |
| ) |
| |
| // Drops similar sentences (with same tokens structure). |
| // Among similar sentences we prefer one with minimal free words count. |
| sens.groupBy(_.flatten.filter(!_.isNlp).map(_.getKey(withIndexes = false))). |
| map { case (_, seq) ⇒ seq.minBy(_.filter(p ⇒ p.isNlp && !p.isStopWord).map(_.wordIndexes.length).sum) }. |
| toSeq |
| } |
| |
| override def start(parent: Span): NCService = { |
| ackStarting() |
| |
| pool = new java.util.concurrent.ForkJoinPool() |
| |
| ackStarted() |
| } |
| |
| override def stop(parent: Span): Unit = { |
| ackStopping() |
| |
| U.shutdownPool(pool) |
| |
| ackStopped() |
| } |
| |
| /** |
| * |
| * @param mdl |
| * @param sen |
| * @param lastPhase |
| * @return |
| */ |
| def collapse(mdl: NCModel, sen: NCNlpSentence, lastPhase: Boolean = false): Seq[NCNlpSentence] = |
| collapseSentence(sen, mdl, lastPhase) |
| } |