/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nlpcraft.common.nlp
import org.apache.nlpcraft.common.U
import org.apache.nlpcraft.common.ascii._
import java.io.{Serializable => JSerializable}
import java.util.{List => JList}
import scala.collection.mutable
import scala.language.implicitConversions
import scala.jdk.CollectionConverters.{CollectionHasAsScala, SeqHasAsJava}
/**
* Sentence token note is a typed map of KV pairs.
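*
* A minimal usage sketch (token indexes and parameter values below are purely illustrative):
* {{{
* val note = NCNlpSentenceNote(Seq(0, 1), "nlpcraft:nlp", "origText" -> "hello world", "direct" -> true)
*
* note.noteType  // "nlpcraft:nlp"
* note.tokenFrom // 0
* note.tokenTo   // 1
* note.isNlp     // true
* }}}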
*/
class NCNlpSentenceNote(private val values: Map[String, JSerializable]) extends JSerializable with NCAsciiLike {
import NCNlpSentenceNote._
private lazy val dataWithoutIndexes = this.filter(p => !SKIP_CLONE.contains(p._1))
private lazy val skipNlp = dataWithoutIndexes.filter { case (key, _) => key != "noteType" }
@transient
private lazy val hash = values.hashCode()
// Shortcuts for mandatory, immutable fields.
lazy val noteType: String = values("noteType").asInstanceOf[String]
lazy val tokenFrom: Int = values("tokMinIndex").asInstanceOf[Int] // First index.
lazy val tokenTo: Int = values("tokMaxIndex").asInstanceOf[Int] // Last index.
lazy val tokenIndexes: Seq[Int] = values("tokWordIndexes").asInstanceOf[JList[Int]].asScala.toSeq // Includes 1st and last indices too.
lazy val wordIndexes: Seq[Int] = values("wordIndexes").asInstanceOf[JList[Int]].asScala.toSeq // Includes 1st and last indices too.
lazy val wordIndexesSet: Set[Int] = wordIndexes.toSet
lazy val sparsity: Int = values("sparsity").asInstanceOf[Int]
lazy val isDirect: Boolean = values("direct").asInstanceOf[Boolean]
lazy val isUser: Boolean = {
val i = noteType.indexOf(":")
if (i > 0) !TOK_PREFIXES.contains(noteType.take(i)) else true
}
lazy val isSystem: Boolean = !isUser
lazy val isNlp: Boolean = noteType == "nlpcraft:nlp"
// Typed getter.
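// Illustrative usage (the "origText" key is just an example): val txt = note.data[String]("origText")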
def data[T](key: String): T = values(key).asInstanceOf[T]
def dataOpt[T](key: String): Option[T] = values.get(key).asInstanceOf[Option[T]]
override def equals(obj: Any): Boolean = obj match {
case h: NCNlpSentenceNote => h.hashCode() == hashCode() && h.values == values
case _ => false
}
override def hashCode(): Int = hash
/**
* Clones this note.
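*
* Index-related properties are re-calculated from the given indexes; the remaining note data
* is copied, with `params` overriding existing values. Illustrative sketch (given some `note`):
* {{{
* val shifted = note.clone(Seq(1, 2), Seq(1, 2), "origText" -> "hello there")
* }}}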
*/
def clone(indexes: Seq[Int], wordIndexes: Seq[Int], params: (String, JSerializable)*): NCNlpSentenceNote =
apply(
indexes,
Some(wordIndexes),
noteType,
dataWithoutIndexes ++ params.toMap
)
override def clone(): NCNlpSentenceNote = new NCNlpSentenceNote(values)
/**
* Compares this note with the given one ignoring absolute indexes: notes are equal if they
* have the same type, the same non-index data and word indexes shifted by a constant offset.
*
* @param n Note to compare with.
*/
def equalsWithoutIndexes(n: NCNlpSentenceNote): Boolean =
this.noteType == n.noteType &&
this.wordIndexes.size == n.wordIndexes.size &&
this.wordIndexes.zip(n.wordIndexes).map(p => p._1 - p._2).distinct.size == 1 &&
this.dataWithoutIndexes == n.dataWithoutIndexes
/**
* @return This note rendered as an ASCII table of its key-value pairs, sorted by key.
*/
override def toAscii: String =
values.iterator.toSeq.sortBy(_._1).foldLeft(NCAsciiTable("Key", "Value"))((t, p) => t += p).toString
/**
* @return This note's data as a metadata map. For user-defined notes the content of the
*     optional 'meta' property is returned (or an empty map); for built-in notes the note
*     properties are returned with lower-cased keys.
*/
def asMetadata(): Map[String, JSerializable] =
if (isUser)
values.get("meta") match {
case Some(meta) => meta.asInstanceOf[Map[String, JSerializable]]
case None => Map.empty[String, JSerializable]
}
else {
val md = mutable.Map.empty[String, JSerializable]
val m = if (noteType != "nlpcraft:nlp") skipNlp else values
m.foreach { case (name, value) => md += (name.toLowerCase() -> value)}
md.toMap
}
/**
* Clones this note adding or overriding the given key-value pairs.
*
* @param kvs Key-value pairs to add to the clone.
*/
def clone(kvs : (String, JSerializable)*): NCNlpSentenceNote =
new NCNlpSentenceNote(values ++ kvs)
/**
* Gets a sequence of values that uniquely identifies this note for comparison purposes.
*
* @param withIndexes Whether to include word indexes into the key.
* @param withReferences Whether to include reference properties (e.g. 'indexes') into the key.
* @return Sequence of values forming the key.
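*
* Illustrative sketch (the note type and properties are examples only):
* {{{
* // For a built-in 'nlpcraft:date' note this yields Seq(wordIndexes, "nlpcraft:date", from, to).
* val key = note.getKey(withIndexes = true)
* }}}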
*/
def getKey(withIndexes: Boolean = true, withReferences: Boolean = true): Seq[Any] = {
val seq1 = if (withIndexes) Seq(wordIndexes, noteType) else Seq(noteType)
val seq2 = if (isUser)
Seq.empty
else
getBuiltProperties(noteType, withReferences).map(name => this.getOrElse(name, null))
seq1 ++ seq2
}
/**
* @return String representation of this note with its main properties shown first.
*/
override def toString: String =
values.toSeq.sortBy(t => { // Show the main properties first.
val typeSort = t._1 match {
case "noteType" => 0
case "origText" => 1
case "wordIndexes" => 2
case "direct" => 3
case "sparsity" => 4
case "parts" => 5
case _ => 100
}
(typeSort, t._1)
}).map(p => s"${p._1}=${p._2}").mkString("NLP note [", ", ", "]")
}
object NCNlpSentenceNote {
// These properties are skipped when cloning as they are automatically
// re-calculated when a new clone is created.
private final val SKIP_CLONE: Set[String] = Set(
"minIndex",
"maxIndex",
"wordIndexes",
"wordLength",
"tokMinIndex",
"tokMaxIndex",
"tokWordIndexes",
"sparsity"
)
private final val TOK_PREFIXES = Set("nlpcraft", "google", "opennlp", "stanford", "spacy")
/**
* Implicit conversion from a note to its underlying immutable map of values.
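*
* Illustrative usage:
* {{{
* val m: Map[String, JSerializable] = note // Implicit conversion applied here.
* }}}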
*/
implicit def values(note: NCNlpSentenceNote): Map[String, JSerializable] = note.values
/**
* Creates a new note with the given parameters.
*
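* An illustrative sketch (all values are hypothetical):
* {{{
* val note = NCNlpSentenceNote(Seq(2, 3), Some(Seq(5, 6, 7)), "nlpcraft:city", Map("city" -> "tokyo"))
* }}}
*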
* @param indexes Indexes in the sentence.
* @param wordIndexesOpt Optional word indexes.
* @param typ Type of the note.
* @param params Parameters.
*/
def apply(
indexes: Seq[Int],
wordIndexesOpt: Option[Seq[Int]],
typ: String,
params: Map[String, Any]
): NCNlpSentenceNote = {
def calc(seq: Seq[Int]): (Int, Int, Int, JList[Int], Int) =
(U.calcSparsity(seq), seq.min, seq.max, seq.asJava, seq.length)
val (sparsity, wordMinIndex, wordMaxIndex, wordIndexesJava, wordLen) = calc(wordIndexesOpt.getOrElse(indexes))
new NCNlpSentenceNote(
params.filter(_._2 != null).map(p => p._1 -> p._2.asInstanceOf[JSerializable]) ++
Map[String, JSerializable](
"noteType" -> typ,
"tokMinIndex" -> indexes.min,
"tokMaxIndex" -> indexes.max,
"tokWordIndexes" -> indexes.asJava.asInstanceOf[JSerializable],
"minIndex" -> wordMinIndex,
"maxIndex" -> wordMaxIndex,
"wordIndexes" -> wordIndexesJava.asInstanceOf[JSerializable],
"wordLength" -> wordLen,
"sparsity" -> sparsity
)
)
}
/**
* Creates a new note with the given parameters.
*
* @param indexes Indexes in the sentence.
* @param typ Type of the note.
* @param params Parameters.
*/
def apply(indexes: Seq[Int], typ: String, params: (String, Any)*): NCNlpSentenceNote =
apply(indexes, None, typ, params.toMap)
/**
* Creates a new note with the given parameters.
*
* @param indexes Indexes in the sentence.
* @param typ Type of the note.
* @param params Parameters.
*/
def apply(indexes: mutable.Seq[Int], typ: String, params: (String, Any)*): NCNlpSentenceNote =
apply(indexes.toSeq, None, typ, params.toMap)
/**
* Creates a new note with the given parameters.
*
* @param indexes Indexes in the sentence.
* @param wordIndexes Word indexes in the sentence.
* @param typ Type of the note.
* @param params Parameters.
*/
def apply(indexes: Seq[Int], wordIndexes: Seq[Int], typ: String, params: (String, Any)*): NCNlpSentenceNote =
apply(indexes, Some(wordIndexes), typ, params.toMap)
/**
* Creates a new note with the given parameters.
*
* @param indexes Indexes in the sentence.
* @param wordIndexes Word indexes in the sentence.
* @param typ Type of the note.
* @param params Parameters.
*/
def apply(indexes: mutable.Seq[Int], wordIndexes: mutable.Seq[Int], typ: String, params: (String, Any)*): NCNlpSentenceNote =
apply(indexes.toSeq, Some(wordIndexes.toSeq), typ, params.toMap)
/**
* Gets the built-in properties that participate in the key of the given note type.
*
* @param noteType Built-in note type.
* @param withReferences Whether to include reference properties (e.g. 'indexes').
* @return Sequence of property names.
*/
def getBuiltProperties(noteType: String, withReferences: Boolean = true): Seq[String] = {
def addRefs(names: String*): Seq[String] = if (withReferences) names else Seq.empty
noteType match {
case "nlpcraft:nlp" => Seq.empty
case "nlpcraft:continent" => Seq("continent")
case "nlpcraft:subcontinent" => Seq("continent", "subcontinent")
case "nlpcraft:country" => Seq("continent", "subcontinent", "country")
case "nlpcraft:region" => Seq("continent", "subcontinent", "country", "region")
case "nlpcraft:city" => Seq("continent", "subcontinent", "country", "region", "city")
case "nlpcraft:metro" => Seq("metro")
case "nlpcraft:date" => Seq("from", "to")
case "nlpcraft:relation" => Seq("type", "note") ++ addRefs("indexes")
case "nlpcraft:sort" => Seq("asc", "subjnotes", "bynotes") ++ addRefs("subjindexes", "byindexes")
case "nlpcraft:limit" => Seq("limit", "note") ++ addRefs("indexes", "asc") // Asc flag has sense only with references for limit.
case "nlpcraft:coordinate" => Seq("latitude", "longitude")
case "nlpcraft:num" => Seq("from", "to", "unit", "unitType")
case x if x.startsWith("google:") => Seq("meta", "mentionsBeginOffsets", "mentionsContents", "mentionsTypes")
case x if x.startsWith("stanford:") => Seq("nne")
case x if x.startsWith("opennlp:") => Seq.empty
case x if x.startsWith("spacy:") => Seq("vector")
case _ => throw new AssertionError(s"Unexpected note type: $noteType")
}
}
}