/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nlpcraft.server.nlp.enrichers.basenlp

import io.opencensus.trace.Span
import org.apache.nlpcraft.common._
import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank
import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSentenceToken}
import org.apache.nlpcraft.server.nlp.core.{NCNlpParser, NCNlpServerManager}
import org.apache.nlpcraft.server.nlp.enrichers.NCServerEnricher

import scala.collection._

/**
 * Base NLP enricher. It must be the first enricher in the pipeline: it parses the sentence
 * text with the server NLP parser and creates one token per word, populated with the base
 * NLP attributes (lemma, stem, PoS tag, original/normalized text and character offsets).
 */
object NCBaseNlpEnricher extends NCServerEnricher {
    //noinspection SpellCheckingInspection
    private final val INTERJECTIONS =
        // http://www.vidarholen.net/contents/interjections/
        Set(
            "aah", "aaah", "aaaahh", "aha", "a-ha", "ahem",
            "ahh", "ahhh", "argh", "augh", "aww", "aw",
            "awww", "aww", "aw", "ohh", "ahh", "aw",
            "oh", "bah", "boo", "booh", "brr", "brrrr",
            "duh", "eek", "eeeek", "eep", "eh", "huh",
            "eh", "huh", "eyh", "eww", "ugh", "ewww",
            "gah", "gee", "grr", "grrrr", "hmm", "hm",
            "hmmmm", "humph", "harumph", "huh", "hurrah", "hooray",
            "huzzah", "ich", "yuck", "yak", "meh", "eh",
            "mhm", "mmhm", "uh-hu", "mm", "mmm", "mmh",
            "muahaha", "mwahaha", "bwahaha", "nah", "nuh-uh", "nuh-hu",
            "nuhuh", "oh", "ooh-la-la", "oh-lala", "ooh", "oooh",
            "oomph", "umph", "oops", "ow", "oww", "ouch",
            "oy", "oi", "oyh", "oy", "oyvay", "oy-vay",
            "pew", "pee-yew", "pff", "pffh", "pssh", "pfft",
            "phew", "psst", "sheesh", "jeez", "shh", "hush",
            "shush", "shoo", "tsk-tsk", "tut-tut", "uh-hu", "uhuh",
            "mhm", "uh-oh", "oh-oh", "uh-uh", "unh-unh", "uhh",
            "uhm", "err", "wee", "whee", "weee", "whoa",
            "wow", "yahoo", "yippie", "yay", "yeah", "yeeeeaah",
            "yee-haw", "yeehaw", "yoo-hoo", "yoohoo", "yuh-uh", "yuh-hu",
            "yuhuh", "yuck", "ich", "blech", "bleh", "zing",
            "ba-dum-tss", "badum-tish"
        ).map(_.toLowerCase)

    // The acronyms stand for (Left|Right) (Round|Square|Curly) Bracket.
    // http://www.cis.upenn.edu/~treebank/tokenization.html
    private final val BRACKETS = Map(
        "-LRB-" → "(",
        "-RRB-" → ")",
        "-LSB-" → "[",
        "-RSB-" → "]",
        "-LCB-" → "{",
        "-RCB-" → "}"
    )

    @volatile private var parser: NCNlpParser = _

    /**
     *
     * @param parent Optional parent span.
     * @return
     */
    override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ ⇒
        ackStarting()
        parser = NCNlpServerManager.getParser
        ackStarted()
    }

    /**
     *
     * @param parent Optional parent span.
     */
    override def stop(parent: Span = null): Unit = startScopedSpan("stop", parent) { _ ⇒
        ackStopping()
        ackStopped()
    }

    /**
     *
     * @param ns NLP sentence to enrich.
     * @param parent Optional parent span.
     * @throws NCE
     */
    @throws[NCE]
    override def enrich(ns: NCNlpSentence, parent: Span = null): Unit = {
        require(isStarted)

        startScopedSpan("enrich", parent, "srvReqId" → ns.srvReqId, "txt" → ns.text) { _ ⇒
            // This must be 1st enricher in the pipeline.
            assume(ns.isEmpty)

            var idx = 0

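            // Parse the raw sentence text and create one base token per parsed word.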
            for (word ← parser.parse(ns.text)) {
                val value = word.word.toLowerCase
                val origTxt = word.word
                val tok = NCNlpSentenceToken(idx)

                // Override interjection (UH) analysis.
                // (INTERJECTIONS and lemma should be in lowercase.)
                val pos = if (INTERJECTIONS.contains(word.lemma)) "UH" else word.pos

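                // Collect all base NLP attributes for this token into a single "nlpcraft:nlp" note.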
                val seq = mutable.ArrayBuffer(
                    "lemma" → processBracket(word.lemma),
                    "index" → idx,
                    "pos" → pos,
                    "origText" → processBracket(origTxt),
                    "normText" → processBracket(value),
                    "charLength" → value.length,
                    "stem" → processBracket(word.stem),
                    "posDesc" → NCPennTreebank.description(pos).getOrElse(pos),
                    "start" → word.start,
                    "end" → word.end,
                    "quoted" → false,
                    "stopWord" → false,
                    "bracketed" → false,
                    "direct" → true
                )
                tok.add(NCNlpSentenceNote(Seq(idx), "nlpcraft:nlp", seq: _*))

                // Add new token to NLP sentence.
                ns += tok

                idx += 1
            }
        }
    }

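    /**
     * Replaces Penn Treebank bracket codes (e.g. '-LRB-') with the actual bracket characters,
     * leaving all other strings unchanged.
     *
     * @param s String to process.
     */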
    private def processBracket(s: String): String = BRACKETS.getOrElse(s.toUpperCase, s)
}