blob: 1c66da1be6c9f9ebac0d8333500047d9f77b0af5 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nlpcraft.common.nlp
import org.apache.nlpcraft.common.U
import org.apache.nlpcraft.common.nlp.pos._
import java.util.{List => JList}
import scala.collection.mutable
import scala.jdk.CollectionConverters.ListHasAsScala
import scala.language.implicitConversions
/**
 * NLP token is a collection of NLP notes associated with that token.
 *
 * The token keeps two lazily computed caches: the NLP note (see `getNlpNote`) and the hash code.
 * Both are `@transient` and are invalidated by the mutators defined on this class
 * (`add`, `remove`, `addStopReason`).
 *
 * NOTE(review): `stopsReasons` is a public mutable set; mutating it directly (instead of via
 * `addStopReason`) bypasses the hash code invalidation.
 *
 * @param index Index of this token (presumably its position in the sentence - confirm against callers).
 * @param notes Notes held by this token.
 * @param stopsReasons Notes recorded via `addStopReason`.
 */
case class NCNlpSentenceToken(
    index: Int,
    private val notes: mutable.HashSet[NCNlpSentenceNote] = mutable.HashSet.empty[NCNlpSentenceNote],
    stopsReasons: mutable.HashSet[NCNlpSentenceNote] = mutable.HashSet.empty[NCNlpSentenceNote]
) extends java.io.Serializable {
    // Cached NLP note. Filled lazily by `getNlpNote` and eagerly by `add`; not serialized.
    @transient
    private var nlpNote: NCNlpSentenceNote = _

    // Shortcuts for some frequently used *mandatory* notes.
    // All of them read the value from the NLP note and fail if this token has no NLP note.
    def normText: String = getNlpValue[String]("normText")
    def startCharIndex: Int = getNlpValue[Int]("start").intValue() // Start character index.
    def endCharIndex: Int = getNlpValue[Int]("end").intValue() // End character index.
    def origText: String = getNlpValue[String]("origText")
    def words: Int = origText.split(" ").length
    def wordLength: Int = getNlpValue[Int]("wordLength").intValue()
    def wordIndexes: Seq[Int] = getNlpValue[JList[Int]]("wordIndexes").asScala.toSeq
    def pos: String = getNlpValue[String]("pos")
    def posDesc: String = getNlpValue[String]( "posDesc")
    def lemma: String = getNlpValue[String]("lemma")
    def stem: String = getNlpValue[String]("stem")
    def isStopWord: Boolean = getNlpValue[Boolean]("stopWord")
    def isBracketed: Boolean = getNlpValue[Boolean]("bracketed")
    def isDirect: Boolean = getNlpValue[Boolean]("direct")
    def isQuoted: Boolean = getNlpValue[Boolean]("quoted")
    def isSynthetic: Boolean = NCPennTreebank.isSynthetic(pos)
    def isKnownWord: Boolean = getNlpValue[Boolean]("dict")
    def isSwearWord: Boolean = getNlpValue[Boolean]("swear")
    def isEnglish: Boolean = getNlpValue[Boolean]("english")

    // Cached hash code; `null` means "not computed yet". Reset by every mutator of this class.
    @transient
    private var hash: java.lang.Integer = _

    /**
     * Hash code over the index, the notes and the stop reasons.
     * The value is computed on first use and cached until the token is modified.
     */
    //noinspection HashCodeUsesVar
    override def hashCode(): Int = {
        if (hash == null)
            hash = U.mkJavaHash(index, notes, stopsReasons)

        hash
    }

    /**
     * Two tokens are equal when their indexes, notes and stop reasons are equal.
     *
     * @param obj Object to compare with.
     */
    override def equals(obj: Any): Boolean = obj match {
        case x: NCNlpSentenceToken => x.index == index && x.notes == notes && x.stopsReasons == stopsReasons
        case _ => false
    }

    /**
     * Gets all notes of given type.
     *
     * @param noteType Note type.
     */
    def getNotes(noteType: String): Iterable[NCNlpSentenceNote] = notes.filter(_.noteType == noteType)

    /**
     * Clones this token under a different index.
     * Shallow copy: the note sets are copied, the notes themselves are shared with this token.
     *
     * @param index Index of the cloned token.
     */
    def clone(index: Int): NCNlpSentenceToken =
        NCNlpSentenceToken(index, mutable.HashSet.empty[NCNlpSentenceNote] ++ notes.clone(), stopsReasons.clone())

    /**
     * Clones this token keeping its index.
     * Shallow copy: the note sets are copied, the notes themselves are shared with this token.
     */
    override def clone(): NCNlpSentenceToken = clone(index)

    /**
     * Removes given note. No-op (apart from resetting the caches) if the note wasn't found.
     *
     * @param note Note to remove.
     */
    def remove(note: NCNlpSentenceNote): Unit = {
        notes.remove(note)
        hash = null

        // Drop the cached NLP note if it is the one being removed, so that `getNlpNote`
        // looks it up again instead of returning a note this token no longer holds.
        if (nlpNote == note)
            nlpNote = null
    }

    /**
     * Tests whether or not this token contains given note.
     *
     * The check is a direct lookup in the underlying mutable hash set. A note whose content
     * (and therefore hash code) changed after it was added may not be found by such a lookup, see
     * https://stackoverflow.com/questions/43553806/hashset-contains-returns-false-when-it-shouldnt/43554123
     *
     * NOTE(review): an earlier version of this comment said the notes had to be converted to a set
     * on every call to work around that; the code does not do so - confirm this is intended.
     *
     * @param note Note to look for.
     */
    def contains(note: NCNlpSentenceNote): Boolean = notes.contains(note)

    /**
     * Gets optional note with given type that contains given name.
     *
     * @param noteType Note type.
     * @param noteName Note name.
     * @throws AssertionError If more than one matching note is found.
     */
    def getNoteOpt(noteType: String, noteName: String): Option[NCNlpSentenceNote] = {
        val ns = getNotes(noteType).filter(_.contains(noteName))

        ns.size match {
            case 0 => None
            case 1 => Some(ns.head)
            case _ =>
                throw new AssertionError(
                    s"Multiple notes found [type=$noteType, name=$noteName, token=$notes]"
                )
        }
    }

    /**
     * Gets note with given type and name.
     *
     * @param noteType Note type.
     * @param noteName Note name.
     * @throws AssertionError If no matching note, or more than one, is found.
     */
    def getNote(noteType: String, noteName: String): NCNlpSentenceNote =
        getNoteOpt(noteType, noteName) match {
            case Some(n) => n
            case None =>
                throw new AssertionError(s"Note not found [type=$noteType, name=$noteName, token=$notes]")
        }

    /**
     * Gets NLP note, or `null` if this token has none.
     * The note is looked up on first use and cached.
     */
    def getNlpNote: NCNlpSentenceNote = {
        if (nlpNote == null)
            nlpNote = notes.find(_.isNlp).orNull

        nlpNote
    }

    /**
     * Gets optional value of given name from the NLP note.
     * The value is cast to `T` without any check.
     *
     * @param noteName Note name.
     * @tparam T Type of the note value.
     */
    def getNlpValueOpt[T: Manifest](noteName: String): Option[T] =
        getNlpNote.get(noteName) match {
            case Some(v) => Some(v.asInstanceOf[T])
            case None => None
        }

    /**
     * Gets value of given name from the NLP note.
     * The value is cast to `T` without any check.
     *
     * @param noteName Note name.
     * @tparam T Type of the note value.
     */
    def getNlpValue[T: Manifest](noteName: String): T = getNlpNote(noteName).asInstanceOf[T]

    /**
     * Tests if this token has any notes of given type(s).
     *
     * @param types Note type(s) to check.
     */
    def isTypeOf(types: String*): Boolean = types.exists(t => notes.exists(_.noteType == t))

    /**
     * Adds note to this token. If the note was actually added and it is an NLP note,
     * it becomes the cached NLP note.
     *
     * @param note Note to add.
     */
    def add(note: NCNlpSentenceNote): Unit = {
        hash = null

        val added = notes.add(note)

        if (added && note.isNlp)
            nlpNote = note
    }

    /**
     * Tests whether every note of this token is an NLP note
     * (which is also the case when the token has no notes at all).
     */
    def isNlp: Boolean = notes.forall(_.isNlp)

    /**
     * Tests whether at least one note of this token is a user note.
     */
    def isUser: Boolean = notes.exists(_.isUser)

    /**
     * Records given note as a stop reason.
     *
     * @param reason Note to record.
     */
    def addStopReason(reason: NCNlpSentenceNote): Unit = {
        // Stop reasons take part in `hashCode` and `equals`, so the cached hash must be dropped.
        hash = null

        stopsReasons += reason
    }

    /**
     * String representation: notes separated by `|`, NLP notes first, then ordered by note type.
     */
    override def toString: String =
        notes.toSeq.sortBy(t => (if (t.isNlp) 0 else 1, t.noteType)).mkString("NLP token [", "|", "]")
}
object NCNlpSentenceToken {
    /**
     * Implicit view of a token as its notes.
     * The result is an immutable set built from the token's notes at the time of the call,
     * so later changes to the token are not reflected in it.
     *
     * @param x Token to view as a collection of notes.
     */
    implicit def notes(x: NCNlpSentenceToken): Iterable[NCNlpSentenceNote] = Set.from(x.notes)
}