nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/preproc/NCPreProcessManager.scala - incubator-nlpcraft - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.nlpcraft.server.nlp.preproc

 import io.opencensus.trace.Span
 import org.apache.nlpcraft.common._
 import org.apache.nlpcraft.server.nlp.spell.NCSpellCheckManager

 import scala.collection._

 /**
   * Centralized pre-processor for raw text coming from user.
   *
   * Normalizes typographic quotes and dashes, optionally runs every token through the
   * spell checker and expands unambiguous English contractions.
   */
 object NCPreProcessManager extends NCService {
     // Unambiguous contractions keyed by their lower-case form. Each expansion is stored
     // pre-split into individual tokens so that it can be spliced into a token sequence.
     private final val CONTRACTIONS: Map[String, Seq[String]] =
         Map[String, String](
             "aren't" → "are not",
             "can't" → "cannot",
             "could've" → "could have",
             "couldn't" → "could not",
             "didn't" → "did not",
             "doesn't" → "does not",
             "don't" → "do not",
             "hadn't" → "had not",
             "hasn't" → "has not",
             "haven't" → "have not",
             "he'll" → "he will",
             "how'd" → "how did",
             "how'll" → "how will",
             "i'll" → "I will",
             "i'm" → "I am",
             "i've" → "I have",
             "isn't" → "is not",
             "it'll" → "it will",
             "let's" → "let us",
             "ma'am" → "madam",
             "might've" → "might have",
             "must've" → "must have",
             "needn't" → "need not",
             "o'clock" → "of the clock",
             "shan't" → "shall not",
             "she'll" → "she will",
             "should've" → "should have",
             "shouldn't" → "should not",
             "they'll" → "they will",
             "they're" → "they are",
             "they've" → "they have",
             "wasn't" → "was not",
             "we'll" → "we will",
             "we're" → "we are",
             "we've" → "we have",
             "weren't" → "were not",
             "what'll" → "what will",
             "what're" → "what are",
             "where'd" → "where did",
             "where've" → "where have",
             "who'll" → "who will",
             "won't" → "will not",
             "would've" → "would have",
             "wouldn't" → "would not",
             "y'all" → "you are all",
             "you'll" → "you will",
             "you're" → "you are",
             "you've" → "you have"
         ).map { case (contraction, expansion) ⇒ contraction → expansion.split(' ').toSeq }

     /**
       * Starts this service, acknowledging the 'starting' and 'started' phases inside a "start" span.
       *
       * @param parent Optional parent span.
       * @return Result of the scoped "start" span block.
       */
     override def start(parent: Span): NCService = startScopedSpan("start", parent) { _ ⇒
         ackStarting()
         ackStarted()
     }

     /**
       * Stops this service, acknowledging the 'stopping' and 'stopped' phases inside a "stop" span.
       *
       * @param parent Optional parent span.
       */
     override def stop(parent: Span): Unit = startScopedSpan("stop", parent) { _ ⇒
         ackStopping()
         ackStopped()
     }

     /**
       * Replaces contractions with their expanded forms. Lookup is case-insensitive. Note that this
       * method can change text case: a matched token is replaced by the expansion exactly as it is
       * spelled in the contraction table, regardless of the case of the input token.
       *
       * @param sen Input sentence as a sequence of tokens.
       * @return Tokens with every known contraction replaced by the tokens of its expansion;
       *      all other tokens are passed through unchanged.
       */
     private def replaceContractions(sen: Seq[String]): Seq[String] =
         // Locale-independent lower-casing: under e.g. a Turkish default locale "I'm" would
         // otherwise lower-case to a dotless 'ı' and miss the table.
         sen.flatMap(tok ⇒ CONTRACTIONS.getOrElse(tok.toLowerCase(java.util.Locale.ROOT), Seq(tok)))

     /**
       * Joins tokens into a single space-separated string, optionally spell checking each token first.
       *
       * @param sen Input sentence as a sequence of tokens.
       * @param spellCheck Spell check flag. If `true` every token is passed through
       *      `NCSpellCheckManager.check` before joining.
       * @return Tokens filtered through `U.trimFilter` and joined with a single space.
       */
     private def collect(sen: Seq[String], spellCheck: Boolean): String = {
         val toks = if (spellCheck) sen.map(NCSpellCheckManager.check) else sen

         U.trimFilter(toks).mkString(" ")
     }

     /**
       * Performs all pre-processing and normalizes the given input raw text.
       *
       * @param rawTxt Raw text to normalize.
       * @param spellCheck Using spell checking flag.
       * @param parent Optional parent span.
       * @return Normalized, pre-processed text.
       */
     def normalize(rawTxt: String, spellCheck: Boolean, parent: Span = null): String =
         startScopedSpan("normalize", parent, "txt" → rawTxt, "spellCheck" → spellCheck) { _ ⇒
             // Fix Apple/MacOS smart quotes & dashes. In order:
             //   U+2018 (left single quote), U+2019 (right single quote) -> apostrophe,
             //   U+201C (left double quote), U+201D (right double quote) -> straight double quote,
             //   U+2014 (em dash) -> hyphen-minus.
             val s0 = rawTxt.trim().
                 replace('‘', '\'').
                 replace('’', '\'').
                 replace('“', '"').
                 replace('”', '"').
                 replace('—', '-')

             // First pass: optional per-token spell check, joined back into a single string.
             val checked = collect(s0.split(' ').toSeq, spellCheck)

             // Second pass: re-tokenize the joined text and expand contractions.
             // Spell check is deliberately not repeated on the expanded tokens.
             collect(replaceContractions(checked.split(' ').toSeq), spellCheck = false)
         }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.nlpcraft.server.nlp.preproc

	import io.opencensus.trace.Span
	import org.apache.nlpcraft.common._
	import org.apache.nlpcraft.server.nlp.spell.NCSpellCheckManager

	import scala.collection._

	/**
	  * Centralized pre-processor for raw text coming from user.
	  *
	  * Normalizes typographic quotes and dashes, optionally runs every token through the
	  * spell checker and expands unambiguous English contractions.
	  */
	object NCPreProcessManager extends NCService {
	    // Unambiguous contractions keyed by their lower-case form. Each expansion is stored
	    // pre-split into individual tokens so that it can be spliced into a token sequence.
	    private final val CONTRACTIONS: Map[String, Seq[String]] =
	        Map[String, String](
	            "aren't" → "are not",
	            "can't" → "cannot",
	            "could've" → "could have",
	            "couldn't" → "could not",
	            "didn't" → "did not",
	            "doesn't" → "does not",
	            "don't" → "do not",
	            "hadn't" → "had not",
	            "hasn't" → "has not",
	            "haven't" → "have not",
	            "he'll" → "he will",
	            "how'd" → "how did",
	            "how'll" → "how will",
	            "i'll" → "I will",
	            "i'm" → "I am",
	            "i've" → "I have",
	            "isn't" → "is not",
	            "it'll" → "it will",
	            "let's" → "let us",
	            "ma'am" → "madam",
	            "might've" → "might have",
	            "must've" → "must have",
	            "needn't" → "need not",
	            "o'clock" → "of the clock",
	            "shan't" → "shall not",
	            "she'll" → "she will",
	            "should've" → "should have",
	            "shouldn't" → "should not",
	            "they'll" → "they will",
	            "they're" → "they are",
	            "they've" → "they have",
	            "wasn't" → "was not",
	            "we'll" → "we will",
	            "we're" → "we are",
	            "we've" → "we have",
	            "weren't" → "were not",
	            "what'll" → "what will",
	            "what're" → "what are",
	            "where'd" → "where did",
	            "where've" → "where have",
	            "who'll" → "who will",
	            "won't" → "will not",
	            "would've" → "would have",
	            "wouldn't" → "would not",
	            "y'all" → "you are all",
	            "you'll" → "you will",
	            "you're" → "you are",
	            "you've" → "you have"
	        ).map { case (contraction, expansion) ⇒ contraction → expansion.split(' ').toSeq }

	    /**
	      * Starts this service, acknowledging the 'starting' and 'started' phases inside a "start" span.
	      *
	      * @param parent Optional parent span.
	      * @return Result of the scoped "start" span block.
	      */
	    override def start(parent: Span): NCService = startScopedSpan("start", parent) { _ ⇒
	        ackStarting()
	        ackStarted()
	    }

	    /**
	      * Stops this service, acknowledging the 'stopping' and 'stopped' phases inside a "stop" span.
	      *
	      * @param parent Optional parent span.
	      */
	    override def stop(parent: Span): Unit = startScopedSpan("stop", parent) { _ ⇒
	        ackStopping()
	        ackStopped()
	    }

	    /**
	      * Replaces contractions with their expanded forms. Lookup is case-insensitive. Note that this
	      * method can change text case: a matched token is replaced by the expansion exactly as it is
	      * spelled in the contraction table, regardless of the case of the input token.
	      *
	      * @param sen Input sentence as a sequence of tokens.
	      * @return Tokens with every known contraction replaced by the tokens of its expansion;
	      *      all other tokens are passed through unchanged.
	      */
	    private def replaceContractions(sen: Seq[String]): Seq[String] =
	        // Locale-independent lower-casing: under e.g. a Turkish default locale "I'm" would
	        // otherwise lower-case to a dotless 'ı' and miss the table.
	        sen.flatMap(tok ⇒ CONTRACTIONS.getOrElse(tok.toLowerCase(java.util.Locale.ROOT), Seq(tok)))

	    /**
	      * Joins tokens into a single space-separated string, optionally spell checking each token first.
	      *
	      * @param sen Input sentence as a sequence of tokens.
	      * @param spellCheck Spell check flag. If `true` every token is passed through
	      *      `NCSpellCheckManager.check` before joining.
	      * @return Tokens filtered through `U.trimFilter` and joined with a single space.
	      */
	    private def collect(sen: Seq[String], spellCheck: Boolean): String = {
	        val toks = if (spellCheck) sen.map(NCSpellCheckManager.check) else sen

	        U.trimFilter(toks).mkString(" ")
	    }

	    /**
	      * Performs all pre-processing and normalizes the given input raw text.
	      *
	      * @param rawTxt Raw text to normalize.
	      * @param spellCheck Using spell checking flag.
	      * @param parent Optional parent span.
	      * @return Normalized, pre-processed text.
	      */
	    def normalize(rawTxt: String, spellCheck: Boolean, parent: Span = null): String =
	        startScopedSpan("normalize", parent, "txt" → rawTxt, "spellCheck" → spellCheck) { _ ⇒
	            // Fix Apple/MacOS smart quotes & dashes. In order:
	            //   U+2018 (left single quote), U+2019 (right single quote) -> apostrophe,
	            //   U+201C (left double quote), U+201D (right double quote) -> straight double quote,
	            //   U+2014 (em dash) -> hyphen-minus.
	            val s0 = rawTxt.trim().
	                replace('‘', '\'').
	                replace('’', '\'').
	                replace('“', '"').
	                replace('”', '"').
	                replace('—', '-')

	            // First pass: optional per-token spell check, joined back into a single string.
	            val checked = collect(s0.split(' ').toSeq, spellCheck)

	            // Second pass: re-tokenize the joined text and expand contractions.
	            // Spell check is deliberately not repeated on the expanded tokens.
	            collect(replaceContractions(checked.split(' ').toSeq), spellCheck = false)
	        }
	}