blob: 792b2c68d7b798e5373b0e8380f601e21341cd3e [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nlpcraft.server.nlp.enrichers
import com.typesafe.scalalogging.LazyLogging
import io.opencensus.trace.Span
import org.apache.nlpcraft.common._
import org.apache.nlpcraft.common.NCService
import org.apache.nlpcraft.common.nlp._
import scala.collection._
/**
* Base class for all server enrichers.
*/
abstract class NCServerEnricher extends NCService with LazyLogging {
    /**
     * Enriches the supplied NLP sentence on its own, i.e. using only this enricher.
     * The method returns `Unit`; implementations are defined by subclasses.
     *
     * @param ns NLP sentence to enrich.
     * @param parent Optional parent tracing span, `null` by default.
     */
    @throws[NCE]
    def enrich(ns: NCNlpSentence, parent: Span = null): Unit

    // Key builders: each one projects every token to a single property
    // and joins the projections with one space.

    /** Space-separated stems of the given tokens. */
    final protected def toStemKey(toks: Seq[NCNlpSentenceToken]): String =
        toks.iterator.map(tok => tok.stem).mkString(" ")

    /** Space-separated lemmas of the given tokens. */
    final protected def toLemmaKey(toks: Seq[NCNlpSentenceToken]): String =
        toks.iterator.map(tok => tok.lemma).mkString(" ")

    /** Space-separated lower-cased original texts of the given tokens. */
    final protected def toValueKey(toks: Seq[NCNlpSentenceToken]): String =
        toks.iterator.map(tok => tok.origText.toLowerCase).mkString(" ")

    /** Space-separated original texts of the given tokens, case preserved. */
    final protected def toOriginalKey(toks: Seq[NCNlpSentenceToken]): String =
        toks.iterator.map(tok => tok.origText).mkString(" ")
}
object NCServerEnricher {
    // Penn Treebank POS tags for opening & closing quotes.
    private val Q_POS = Set("``", "''")

    // Stanford POS tags for opening & closing brackets.
    // NOTE: these differ from the standard Penn Treebank tags.
    private val LB_POS = "-LRB-"
    private val RB_POS = "-RRB-"

    /** Tells whether the token's POS tag is one of the quote tags. */
    def isQuote(t: NCNlpSentenceToken): Boolean = Q_POS.contains(t.pos)

    /** Tells whether the token's POS tag is the left (opening) bracket tag. */
    def isLBR(t: NCNlpSentenceToken): Boolean = t.pos == LB_POS

    /** Tells whether the token's POS tag is the right (closing) bracket tag. */
    def isRBR(t: NCNlpSentenceToken): Boolean = t.pos == RB_POS

    /** Tells whether the token is either a left or a right bracket. */
    def isBR(t: NCNlpSentenceToken): Boolean = isLBR(t) || isRBR(t)

    /**
     * Builds a single string from the given tokens by concatenating `get(token)` for each
     * of them. A single space is inserted between two neighbouring tokens unless:
     *  - the current token is a left bracket,
     *  - the next token is a right bracket, or
     *  - the current token's `endCharIndex` equals the next token's `startCharIndex`
     *    (i.e. the two tokens are adjacent with no gap between them).
     *
     * No space is appended after the last token. Empty input yields an empty string.
     *
     * @param toks Tokens to render, in order.
     * @param get Function applied to each token to obtain the text it contributes.
     * @return Concatenated string.
     */
    def mkSumString(toks: Seq[NCNlpSentenceToken], get: NCNlpSentenceTokenString): String = {
        val sb = new mutable.StringBuilder

        // Buffered iterator lets us peek at the next token without indexed access,
        // which would be O(n) per lookup on linear sequences such as `List`.
        val it = toks.iterator.buffered

        while (it.hasNext) {
            val t = it.next()

            sb.append(get(t))

            // Separator is only ever considered between two tokens, never after the last one.
            if (it.hasNext) {
                val next = it.head

                if (!isLBR(t) && !isRBR(next) && t.endCharIndex != next.startCharIndex)
                    sb.append(' ')
            }
        }

        sb.toString
    }
}