nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/core/NCNlpCoreManager.scala - incubator-nlpcraft - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.nlpcraft.common.nlp.core

 import io.opencensus.trace.Span
 import org.apache.nlpcraft.common.config.NCConfigurable
 import org.apache.nlpcraft.common.{NCE, NCService, _}

 /**
  *  NLP core manager.
  */
 object NCNlpCoreManager extends NCService {
     // Engine names accepted for the 'nlpcraft.nlpEngine' configuration property.
     private final val SUPPORTED_NLP_ENGINES = Seq("opennlp", "stanford")

     private object Config extends NCConfigurable {
         /** Value of the 'nlpcraft.nlpEngine' configuration property. */
         def engine: String = getString("nlpcraft.nlpEngine")
     }

     // Assigned in 'start'; stays 'null' until then, which is why 'stop' checks for it.
     @volatile private var tokenizer: NCNlpTokenizer = _

     /**
       * Gets the name of the configured NLP engine.
       *
       * @return Value of the 'nlpcraft.nlpEngine' configuration property.
       */
     def getEngine: String = Config.engine

     /**
      * Starts this manager: validates the configured engine name, creates the tokenizer
      * factory matching that engine by its class name, then creates and starts the tokenizer.
      *
      * @param parent Optional parent span.
      * @return Result of the `ackStart()` call.
      * @throws NCE If the configured engine is not one of the supported engines.
      */
     override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { span ⇒
         // Read the property once so validation, tagging, logging and dispatch all see the same value.
         val engine = Config.engine

         // NOTE: DO NOT confuse this with token providers.
         if (!SUPPORTED_NLP_ENGINES.contains(engine))
             throw new NCE(s"Unsupported NLP engine: $engine")

         addTags(span, "nlpEngine" → engine)

         logger.info(s"NLP engine configured: $engine")

         val factory: NCNlpTokenizerFactory =
             engine match {
                 case "stanford" ⇒ U.mkObject("org.apache.nlpcraft.common.nlp.core.stanford.NCStanfordTokenizerFactory")
                 case "opennlp" ⇒ U.mkObject("org.apache.nlpcraft.common.nlp.core.opennlp.NCOpenNlpTokenizerFactory")

                 // Not reachable after the check above; guards against the supported list
                 // and this match drifting apart.
                 case _ ⇒ throw new AssertionError(s"Unexpected engine: $engine")
             }

         tokenizer = factory.mkTokenizer()

         tokenizer.start()

         ackStart()
     }

     /**
      * Stops this manager. The tokenizer is stopped first if one has been created.
      *
      * @param parent Optional parent span.
      */
     override def stop(parent: Span): Unit = startScopedSpan("stop", parent) { span ⇒
         if (tokenizer != null)
             tokenizer.stop(span)

         ackStop()
     }

     /**
       * Stems given word or a sequence of words which will be tokenized before.
       *
       * Stems are joined with a single space. A stem additionally gets one leading space
       * when its token starts more than one position after the previous token's `to` offset.
       *
       * @param words One or more words to stemmatize.
       * @return Sentence with stemmed words; empty string if tokenization yields no tokens.
       */
     def stem(words: String): String = {
         val stemmed = tokenizer.tokenize(words).map(tok ⇒ tok → NCNlpPorterStemmer.stem(tok.token))

         // Pair every token with its predecessor in one pass instead of looking the
         // predecessor up by index, which is linear per lookup on list-backed sequences.
         val followingStems = stemmed.zip(stemmed.drop(1)).map { case ((prev, _), (tok, stem)) ⇒
             // Suppose there aren't multiple spaces.
             if (prev.to + 1 < tok.from) s" $stem" else stem
         }

         // The first stem (if any) is emitted as is.
         (stemmed.take(1).map(_._2) ++ followingStems).mkString(" ")
     }

     /**
       * Stems given word.
       *
       * @param word Word to stemmatize.
       * @return Stemmed word.
       */
     def stemWord(word: String): String = NCNlpPorterStemmer.stem(word)

     /**
       * Tokenizes given sentence using the tokenizer created in `start`.
       *
       * @param sen Sentence text.
       * @return Tokens.
       */
     def tokenize(sen: String): Seq[NCNlpCoreToken] = tokenizer.tokenize(sen)
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.nlpcraft.common.nlp.core

	import io.opencensus.trace.Span
	import org.apache.nlpcraft.common.config.NCConfigurable
	import org.apache.nlpcraft.common.{NCE, NCService, _}

	/**
	* NLP core manager.
	*/
	object NCNlpCoreManager extends NCService {
	    // Engine names accepted for the 'nlpcraft.nlpEngine' configuration property.
	    private final val SUPPORTED_NLP_ENGINES = Seq("opennlp", "stanford")

	    private object Config extends NCConfigurable {
	        /** Value of the 'nlpcraft.nlpEngine' configuration property. */
	        def engine: String = getString("nlpcraft.nlpEngine")
	    }

	    // Assigned in 'start'; stays 'null' until then, which is why 'stop' checks for it.
	    @volatile private var tokenizer: NCNlpTokenizer = _

	    /**
	      * Gets the name of the configured NLP engine.
	      *
	      * @return Value of the 'nlpcraft.nlpEngine' configuration property.
	      */
	    def getEngine: String = Config.engine

	    /**
	      * Starts this manager: validates the configured engine name, creates the tokenizer
	      * factory matching that engine by its class name, then creates and starts the tokenizer.
	      *
	      * @param parent Optional parent span.
	      * @return Result of the `ackStart()` call.
	      * @throws NCE If the configured engine is not one of the supported engines.
	      */
	    override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { span ⇒
	        // Read the property once so validation, tagging, logging and dispatch all see the same value.
	        val engine = Config.engine

	        // NOTE: DO NOT confuse this with token providers.
	        if (!SUPPORTED_NLP_ENGINES.contains(engine))
	            throw new NCE(s"Unsupported NLP engine: $engine")

	        addTags(span, "nlpEngine" → engine)

	        logger.info(s"NLP engine configured: $engine")

	        val factory: NCNlpTokenizerFactory =
	            engine match {
	                case "stanford" ⇒ U.mkObject("org.apache.nlpcraft.common.nlp.core.stanford.NCStanfordTokenizerFactory")
	                case "opennlp" ⇒ U.mkObject("org.apache.nlpcraft.common.nlp.core.opennlp.NCOpenNlpTokenizerFactory")

	                // Not reachable after the check above; guards against the supported list
	                // and this match drifting apart.
	                case _ ⇒ throw new AssertionError(s"Unexpected engine: $engine")
	            }

	        tokenizer = factory.mkTokenizer()

	        tokenizer.start()

	        ackStart()
	    }

	    /**
	      * Stops this manager. The tokenizer is stopped first if one has been created.
	      *
	      * @param parent Optional parent span.
	      */
	    override def stop(parent: Span): Unit = startScopedSpan("stop", parent) { span ⇒
	        if (tokenizer != null)
	            tokenizer.stop(span)

	        ackStop()
	    }

	    /**
	      * Stems given word or a sequence of words which will be tokenized before.
	      *
	      * Stems are joined with a single space. A stem additionally gets one leading space
	      * when its token starts more than one position after the previous token's `to` offset.
	      *
	      * @param words One or more words to stemmatize.
	      * @return Sentence with stemmed words; empty string if tokenization yields no tokens.
	      */
	    def stem(words: String): String = {
	        val stemmed = tokenizer.tokenize(words).map(tok ⇒ tok → NCNlpPorterStemmer.stem(tok.token))

	        // Pair every token with its predecessor in one pass instead of looking the
	        // predecessor up by index, which is linear per lookup on list-backed sequences.
	        val followingStems = stemmed.zip(stemmed.drop(1)).map { case ((prev, _), (tok, stem)) ⇒
	            // Suppose there aren't multiple spaces.
	            if (prev.to + 1 < tok.from) s" $stem" else stem
	        }

	        // The first stem (if any) is emitted as is.
	        (stemmed.take(1).map(_._2) ++ followingStems).mkString(" ")
	    }

	    /**
	      * Stems given word.
	      *
	      * @param word Word to stemmatize.
	      * @return Stemmed word.
	      */
	    def stemWord(word: String): String = NCNlpPorterStemmer.stem(word)

	    /**
	      * Tokenizes given sentence using the tokenizer created in `start`.
	      *
	      * @param sen Sentence text.
	      * @return Tokens.
	      */
	    def tokenize(sen: String): Seq[NCNlpCoreToken] = tokenizer.tokenize(sen)
	}