/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nlpcraft.server.nlp.enrichers.basenlp

import io.opencensus.trace.Span
import org.apache.nlpcraft.common._
import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank
import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSentenceToken}
import org.apache.nlpcraft.server.nlp.core.{NCNlpParser, NCNlpServerManager}
import org.apache.nlpcraft.server.nlp.enrichers.NCServerEnricher

import scala.collection._

/**
 * Base NLP enricher. It must be the first enricher in the pipeline: it parses the sentence
 * text with the server NLP parser and creates one token per word, populated with the base
 * NLP attributes (lemma, stem, PoS tag, original/normalized text and character offsets).
 */
object NCBaseNlpEnricher extends NCServerEnricher {
    //noinspection SpellCheckingInspection
    private final val INTERJECTIONS =
        // http://www.vidarholen.net/contents/interjections/
        Set(
            "aah", "aaah", "aaaahh", "aha", "a-ha", "ahem",
            "ahh", "ahhh", "argh", "augh", "aww", "aw",
            "awww", "aww", "aw", "ohh", "ahh", "aw",
            "oh", "bah", "boo", "booh", "brr", "brrrr",
            "duh", "eek", "eeeek", "eep", "eh", "huh",
            "eh", "huh", "eyh", "eww", "ugh", "ewww",
            "gah", "gee", "grr", "grrrr", "hmm", "hm",
            "hmmmm", "humph", "harumph", "huh", "hurrah", "hooray",
            "huzzah", "ich", "yuck", "yak", "meh", "eh",
            "mhm", "mmhm", "uh-hu", "mm", "mmm", "mmh",
            "muahaha", "mwahaha", "bwahaha", "nah", "nuh-uh", "nuh-hu",
            "nuhuh", "oh", "ooh-la-la", "oh-lala", "ooh", "oooh",
            "oomph", "umph", "oops", "ow", "oww", "ouch",
            "oy", "oi", "oyh", "oy", "oyvay", "oy-vay",
            "pew", "pee-yew", "pff", "pffh", "pssh", "pfft",
            "phew", "psst", "sheesh", "jeez", "shh", "hush",
            "shush", "shoo", "tsk-tsk", "tut-tut", "uh-hu", "uhuh",
            "mhm", "uh-oh", "oh-oh", "uh-uh", "unh-unh", "uhh",
            "uhm", "err", "wee", "whee", "weee", "whoa",
            "wow", "yahoo", "yippie", "yay", "yeah", "yeeeeaah",
            "yee-haw", "yeehaw", "yoo-hoo", "yoohoo", "yuh-uh", "yuh-hu",
            "yuhuh", "yuck", "ich", "blech", "bleh", "zing",
            "ba-dum-tss", "badum-tish"
        ).map(_.toLowerCase)

    // The acronyms stand for (Left|Right) (Round|Square|Curly) Bracket.
    // http://www.cis.upenn.edu/~treebank/tokenization.html
    private final val BRACKETS = Map(
        "-LRB-" → "(",
        "-RRB-" → ")",
        "-LSB-" → "[",
        "-RSB-" → "]",
        "-LCB-" → "{",
        "-RCB-" → "}"
    )

    @volatile private var parser: NCNlpParser = _

    /**
     *
     * @param parent Optional parent span.
     * @return
     */
    override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ ⇒
        ackStarting()
        parser = NCNlpServerManager.getParser
        ackStarted()
    }

    /**
     *
     * @param parent Optional parent span.
     */
    override def stop(parent: Span = null): Unit = startScopedSpan("stop", parent) { _ ⇒
        ackStopping()
        ackStopped()
    }

    /**
     *
     * @param ns NLP sentence to enrich.
     * @param parent Optional parent span.
     * @throws NCE
     */
    @throws[NCE]
    override def enrich(ns: NCNlpSentence, parent: Span = null): Unit = {
        require(isStarted)

        startScopedSpan("enrich", parent, "srvReqId" → ns.srvReqId, "txt" → ns.text) { _ ⇒
            // This must be 1st enricher in the pipeline.
            assume(ns.isEmpty)

            var idx = 0

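            // Parse the raw sentence text and create one base token per parsed word.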
            for (word ← parser.parse(ns.text)) {
                val value = word.word.toLowerCase
                val origTxt = word.word
                val tok = NCNlpSentenceToken(idx)

                // Override interjection (UH) analysis.
                // (INTERJECTIONS and lemma should be in lowercase.)
                val pos = if (INTERJECTIONS.contains(word.lemma)) "UH" else word.pos

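                // Collect all base NLP attributes for this token into a single "nlpcraft:nlp" note.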
                val seq = mutable.ArrayBuffer(
                    "lemma" → processBracket(word.lemma),
                    "index" → idx,
                    "pos" → pos,
                    "origText" → processBracket(origTxt),
                    "normText" → processBracket(value),
                    "charLength" → value.length,
                    "stem" → processBracket(word.stem),
                    "posDesc" → NCPennTreebank.description(pos).getOrElse(pos),
                    "start" → word.start,
                    "end" → word.end,
                    "quoted" → false,
                    "stopWord" → false,
                    "bracketed" → false,
                    "direct" → true
                )
                tok.add(NCNlpSentenceNote(Seq(idx), "nlpcraft:nlp", seq: _*))

                // Add new token to NLP sentence.
                ns += tok

                idx += 1
            }
        }
    }

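    /**
     * Replaces Penn Treebank bracket codes (e.g. '-LRB-') with the actual bracket characters,
     * leaving all other strings unchanged.
     *
     * @param s String to process.
     */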
    private def processBracket(s: String): String = BRACKETS.getOrElse(s.toUpperCase, s)
}