blob: 7be0bee1c8792aa78d5054022f2020648a4e996f [file] [log] [blame]
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nlpcraft.common.nlp
import org.apache.nlpcraft.common.nlp.pos._
import scala.collection.JavaConverters._
import scala.collection.mutable
import scala.language.implicitConversions
/**
  * NLP token is a collection of NLP notes associated with that token.
  *
  * The token keeps a lazily resolved reference to its NLP note (the note for which
  * `isNlp` is `true`); all shortcut accessors below read their values from that note.
  *
  * @param index Index of this token.
  * @param notes Mutable set of notes attached to this token.
  * @param stopsReasons Mutable set of notes recorded through `addStopReason`.
  */
case class NCNlpSentenceToken(
    index: Int,
    private val notes: mutable.HashSet[NCNlpSentenceNote] = mutable.HashSet.empty[NCNlpSentenceNote],
    stopsReasons: mutable.HashSet[NCNlpSentenceNote] = mutable.HashSet.empty[NCNlpSentenceNote]
) {
    // Cached NLP note. `null` means "not resolved yet" (or invalidated by `remove`);
    // it is (re)resolved on demand by `getNlpNote`.
    private var nlpNote: NCNlpSentenceNote = _

    // Shortcuts for some frequently used *mandatory* notes.
    // Each one reads a value of the NLP note; see `getNlpValue` for failure modes.
    def normText: String = getNlpValue[String]("normText")
    def startCharIndex: Int = getNlpValue[Int]("start") // Start character index.
    def endCharIndex: Int = getNlpValue[Int]("end") // End character index.
    def origText: String = getNlpValue[String]("origText")
    // Number of chunks produced by splitting the original text on a single space.
    def words: Int = origText.split(" ").length
    def wordLength: Int = getNlpValue[Int]("wordLength")
    def wordIndexes: Seq[Int] = getNlpValue[java.util.List[Int]]("wordIndexes").asScala
    def pos: String = getNlpValue[String]("pos")
    def posDesc: String = getNlpValue[String]("posDesc")
    def lemma: String = getNlpValue[String]("lemma")
    def stem: String = getNlpValue[String]("stem")
    def isStopWord: Boolean = getNlpValue[Boolean]("stopWord")
    def isBracketed: Boolean = getNlpValue[Boolean]("bracketed")
    def isDirect: Boolean = getNlpValue[Boolean]("direct")
    def isQuoted: Boolean = getNlpValue[Boolean]("quoted")
    def isSynthetic: Boolean = NCPennTreebank.isSynthetic(pos)
    def isKnownWord: Boolean = getNlpValue[Boolean]("dict")
    def isSwearWord: Boolean = getNlpValue[Boolean]("swear")
    def isEnglish: Boolean = getNlpValue[Boolean]("english")

    /**
      * Gets all notes of the given type.
      *
      * @param noteType Note type.
      * @return Notes of this token whose `noteType` equals the given type (possibly empty).
      */
    def getNotes(noteType: String): Iterable[NCNlpSentenceNote] = notes.filter(_.noteType == noteType)

    /**
      * Clones this token under a new index.
      * Every note is cloned individually via its own `clone()`.
      *
      * @param index Index for the cloned token.
      * @return New token holding clones of this token's notes.
      */
    def clone(index: Int): NCNlpSentenceToken = {
        val copies = mutable.HashSet.empty[NCNlpSentenceNote]

        notes.foreach(copies += _.clone())

        // NOTE(review): the stop reasons set is copied (same elements, separate set) so that
        // `addStopReason` on the clone does not leak into this token - confirm this is the
        // intended sharing semantics.
        NCNlpSentenceToken(index, copies, stopsReasons.clone())
    }

    /**
      * Clones this token keeping its index. See `clone(index)`.
      */
    override def clone(): NCNlpSentenceToken = clone(index)

    /**
      * Removes the given note. No-op if the note is not part of this token.
      *
      * If the removed note is the cached NLP note, the cache is cleared so that
      * `getNlpNote` does not keep returning a note that no longer belongs to this token.
      *
      * @param note Note.
      */
    def remove(note: NCNlpSentenceNote): Unit =
        if (notes.remove(note) && note == nlpNote)
            nlpNote = null

    /**
      * Tests whether or not this token contains the note.
      *
      * A linear equality scan is used on purpose instead of a hash lookup: per the original
      * author's remark, a note's content (and therefore its hash code) may change after the
      * note was added, in which case a hash lookup would no longer find it.
      *
      * @param note Note.
      */
    def contains(note: NCNlpSentenceNote): Boolean = notes.exists(_ == note)

    /**
      * Gets the optional note with the given type that contains the given name.
      *
      * @param noteType Note type.
      * @param noteName Note name.
      * @return `None` if there is no such note, `Some` if there is exactly one.
      * @throws AssertionError If more than one note matches.
      */
    def getNoteOpt(noteType: String, noteName: String): Option[NCNlpSentenceNote] =
        getNotes(noteType).filter(_.contains(noteName)).toList match {
            case Nil => None
            case single :: Nil => Some(single)
            case _ =>
                throw new AssertionError(
                    s"Multiple notes found [type=$noteType, name=$noteName, token=$notes]"
                )
        }

    /**
      * Gets note with given type and name.
      *
      * @param noteType Note type.
      * @param noteName Note name.
      * @throws AssertionError If the note is not found (or more than one note matches).
      */
    def getNote(noteType: String, noteName: String): NCNlpSentenceNote =
        getNoteOpt(noteType, noteName).getOrElse(
            throw new AssertionError(s"Note not found [type=$noteType, name=$noteName, token=$notes]")
        )

    /**
      * Gets NLP note.
      *
      * @return Cached NLP note, resolved on first use; `null` if this token has no NLP note.
      */
    def getNlpNote: NCNlpSentenceNote = {
        if (nlpNote == null)
            nlpNote = notes.find(_.isNlp).orNull

        nlpNote
    }

    /**
      * Gets the optional value of the NLP note.
      *
      * @param noteName Note name.
      * @tparam T Type of the note value.
      * @return Value cast to `T`, or `None` if the value (or the NLP note itself) is missing.
      */
    def getNlpValueOpt[T: Manifest](noteName: String): Option[T] =
        // `Option(...)` guards against a token that has no NLP note (null from `getNlpNote`).
        Option(getNlpNote).flatMap(_.get(noteName)).map(_.asInstanceOf[T])

    /**
      * Gets the mandatory value of the NLP note.
      * Fails if this token has no NLP note, or as the note's `apply` does for a missing name.
      *
      * @param noteName Note name.
      * @tparam T Type of the note value.
      */
    def getNlpValue[T: Manifest](noteName: String): T = getNlpNote(noteName).asInstanceOf[T]

    /**
      * Tests if this token has any notes of given type(s).
      *
      * @param types Note type(s) to check.
      */
    def isTypeOf(types: String*): Boolean = types.exists(t => notes.exists(_.noteType == t))

    /**
      * Adds element. If the note was actually added and is an NLP note,
      * it becomes the cached NLP note.
      *
      * @param note Element.
      */
    def add(note: NCNlpSentenceNote): Unit = {
        val added = notes.add(note)

        if (added && note.isNlp)
            nlpNote = note
    }

    /**
      * Simple word is a non synthetic word that's also not part of any domain-specific note type.
      *
      * @return `true` if every note of this token is an NLP note.
      */
    def isNlp: Boolean = notes.forall(_.isNlp)

    /**
      * @return `true` if at least one note of this token is a user note.
      */
    def isUser: Boolean = notes.exists(_.isUser)

    /**
      * Records the given note in `stopsReasons`.
      *
      * @param reason Note to record.
      */
    def addStopReason(reason: NCNlpSentenceNote): Unit = stopsReasons += reason

    /**
      * String form: notes joined by `|`, NLP notes first, then ordered by note type.
      */
    override def toString: String =
        notes.toSeq.sortBy(t => (if (t.isNlp) 0 else 1, t.noteType)).mkString("NLP token [", "|", "]")
}
object NCNlpSentenceToken {
    /**
      * Implicitly exposes the notes of a token as an immutable collection.
      *
      * The result is a snapshot (`toSet`) taken at conversion time, so callers can neither
      * modify the token's internal mutable note set nor observe later changes through it.
      *
      * @param x Token.
      * @return Immutable set with the token's current notes.
      */
    implicit def notes(x: NCNlpSentenceToken): Iterable[NCNlpSentenceNote] = x.notes.toSet
}