// nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/validate/NCValidateManager.scala - incubator-nlpcraft - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.nlpcraft.probe.mgrs.nlp.validate

 import com.typesafe.scalalogging.LazyLogging
 import io.opencensus.trace.Span
 import org.apache.tika.langdetect.OptimaizeLangDetector
 import org.apache.nlpcraft.common.NCService
 import org.apache.nlpcraft.common.nlp.NCNlpSentence
 import org.apache.nlpcraft.probe.mgrs.NCProbeModel

 /**
  * Validates sentences against model restrictions before and after probe enrichment.
  *
  * Checks run in a fixed order; the first violated restriction is reported by
  * throwing `NCValidateException` with the corresponding error code.
  */
 object NCValidateManager extends NCService with LazyLogging {
     // Language detector shared by all validation calls; assigned in `start()`.
     @volatile private var langFinder: OptimaizeLangDetector = _

     /**
      * Creates the language detector, loads its models and starts this service.
      *
      * @param parent Optional parent tracing span.
      * @return Result of the parent `start()` call.
      */
     override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ =>
         langFinder = new OptimaizeLangDetector()
         langFinder.loadModels()

         super.start()
     }

     /**
      * Stops this service by delegating to the parent implementation.
      *
      * @param parent Optional parent tracing span.
      */
     override def stop(parent: Span = null): Unit = startScopedSpan("stop", parent) { _ =>
         super.stop()
     }

     /**
      * Throws validation exception with given error code when the condition holds.
      *
      * @param errCode Error code passed to the thrown exception.
      * @param failed Violation condition; by-name, so it is evaluated only when this call is reached.
      */
     @throws[NCValidateException]
     private def rejectIf(errCode: String)(failed: => Boolean): Unit =
         if (failed)
             throw NCValidateException(errCode)

     /**
      * Validates given sentence before enrichment: character set, language, presence of
      * nouns, minimal number of words and maximal number of tokens.
      *
      * @param w Model decorator.
      * @param ns Sentence to validate.
      * @param parent Parent tracing span.
      */
     @throws[NCValidateException]
     def preValidate(w: NCProbeModel, ns: NCNlpSentence, parent: Span = null): Unit =
         startScopedSpan(
             "validate",
             parent,
             "srvReqId" -> ns.srvReqId,
             "txt" -> ns.text,
             "mdlId" -> w.model.getId
         ) { _ =>
             val model = w.model

             // Text may consist only of whitespaces, word characters and punctuation.
             rejectIf("ALLOW_NON_LATIN_CHARSET") {
                 !(model.isNotLatinCharsetAllowed || ns.text.matches("""[\s\w\p{Punct}]+"""))
             }

             // NOTE(review): one detector instance is shared by all callers - confirm that
             // `detect(...)` is safe to call concurrently.
             rejectIf("ALLOW_NON_ENGLISH") {
                 !(model.isNonEnglishAllowed || langFinder.detect(ns.text).isLanguage("en"))
             }

             // NOTE(review): POS prefix is compared in lower case - confirm the casing of
             // the POS tags stored in tokens.
             rejectIf("ALLOW_NO_NOUNS") {
                 !(model.isNoNounsAllowed || ns.exists(tok => tok.pos.startsWith("n")))
             }

             rejectIf("MIN_WORDS") {
                 model.getMinWords > ns.map(tok => tok.wordLength).sum
             }

             rejectIf("MAX_TOKENS") {
                 ns.size > model.getMaxTokens
             }
         }

     /**
      * Validates given sentence after enrichment: overlapping notes, user tokens, swear
      * words and the configured limits on tokens, stopwords, unknown, suspicious and
      * free words.
      *
      * @param w Model decorator.
      * @param ns Sentence to validate.
      * @param parent Optional parent span.
      */
     @throws[NCValidateException]
     def postValidate(w: NCProbeModel, ns: NCNlpSentence, parent: Span = null): Unit =
         startScopedSpan(
             "validate",
             parent,
             "srvReqId" -> ns.srvReqId,
             "txt" -> ns.text,
             "mdlId" -> w.model.getId
         ) { _ =>
             val model = w.model

             // Distinct types of all non-NLP notes found in the sentence.
             val noteTypes = ns.flatten.filter(note => !note.isNlp).map(note => note.noteType).distinct

             // Notes of the above types, kept only for tokens that hold more than one of them.
             val notesPerToken = ns.map(tok => noteTypes.flatMap(tp => tok.getNotes(tp)))
             val overlaps = notesPerToken.filter(notes => notes.size > 1).flatten

             rejectIf("OVERLAP_NOTES") {
                 overlaps.nonEmpty
             }

             // At least one note outside of the "nlpcraft:" namespace is expected.
             rejectIf("ALLOW_NO_USER_TOKENS") {
                 !(model.isNoUserTokensAllowed ||
                     ns.exists(tok => tok.exists(note => !note.noteType.startsWith("nlpcraft:"))))
             }

             rejectIf("ALLOW_SWEAR_WORDS") {
                 !model.isSwearWordsAllowed &&
                     ns.exists(tok => tok.getNlpValueOpt[Boolean]("swear").getOrElse(false))
             }

             rejectIf("MIN_NON_STOPWORDS") {
                 model.getMinNonStopwords > ns.count(tok => !tok.isStopWord)
             }

             rejectIf("MIN_TOKENS") {
                 model.getMinTokens > ns.size
             }

             rejectIf("MAX_UNKNOWN_WORDS") {
                 model.getMaxUnknownWords < ns.count(tok => tok.isNlp && !tok.isSynthetic && !tok.isKnownWord)
             }

             rejectIf("MAX_SUSPICIOUS_WORDS") {
                 model.getMaxSuspiciousWords <
                     ns.count(tok => tok.getNlpValueOpt[Boolean]("suspNoun").getOrElse(false))
             }

             rejectIf("MAX_FREE_WORDS") {
                 model.getMaxFreeWords < ns.count(tok => tok.isNlp)
             }
         }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.nlpcraft.probe.mgrs.nlp.validate

	import com.typesafe.scalalogging.LazyLogging
	import io.opencensus.trace.Span
	import org.apache.tika.langdetect.OptimaizeLangDetector
	import org.apache.nlpcraft.common.NCService
	import org.apache.nlpcraft.common.nlp.NCNlpSentence
	import org.apache.nlpcraft.probe.mgrs.NCProbeModel

	/**
	 * Validates sentences against model restrictions before and after probe enrichment.
	 *
	 * Checks run in a fixed order; the first violated restriction is reported by
	 * throwing `NCValidateException` with the corresponding error code.
	 */
	object NCValidateManager extends NCService with LazyLogging {
		// Language detector shared by all validation calls; assigned in `start()`.
		@volatile private var langFinder: OptimaizeLangDetector = _

		/**
		 * Creates the language detector, loads its models and starts this service.
		 *
		 * @param parent Optional parent tracing span.
		 * @return Result of the parent `start()` call.
		 */
		override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ =>
			langFinder = new OptimaizeLangDetector()
			langFinder.loadModels()

			super.start()
		}

		/**
		 * Stops this service by delegating to the parent implementation.
		 *
		 * @param parent Optional parent tracing span.
		 */
		override def stop(parent: Span = null): Unit = startScopedSpan("stop", parent) { _ =>
			super.stop()
		}

		/**
		 * Throws validation exception with given error code when the condition holds.
		 *
		 * @param errCode Error code passed to the thrown exception.
		 * @param failed Violation condition; by-name, so it is evaluated only when this call is reached.
		 */
		@throws[NCValidateException]
		private def rejectIf(errCode: String)(failed: => Boolean): Unit =
			if (failed)
				throw NCValidateException(errCode)

		/**
		 * Validates given sentence before enrichment: character set, language, presence of
		 * nouns, minimal number of words and maximal number of tokens.
		 *
		 * @param w Model decorator.
		 * @param ns Sentence to validate.
		 * @param parent Parent tracing span.
		 */
		@throws[NCValidateException]
		def preValidate(w: NCProbeModel, ns: NCNlpSentence, parent: Span = null): Unit =
			startScopedSpan(
				"validate",
				parent,
				"srvReqId" -> ns.srvReqId,
				"txt" -> ns.text,
				"mdlId" -> w.model.getId
			) { _ =>
				val model = w.model

				// Text may consist only of whitespaces, word characters and punctuation.
				rejectIf("ALLOW_NON_LATIN_CHARSET") {
					!(model.isNotLatinCharsetAllowed || ns.text.matches("""[\s\w\p{Punct}]+"""))
				}

				// NOTE(review): one detector instance is shared by all callers - confirm that
				// `detect(...)` is safe to call concurrently.
				rejectIf("ALLOW_NON_ENGLISH") {
					!(model.isNonEnglishAllowed || langFinder.detect(ns.text).isLanguage("en"))
				}

				// NOTE(review): POS prefix is compared in lower case - confirm the casing of
				// the POS tags stored in tokens.
				rejectIf("ALLOW_NO_NOUNS") {
					!(model.isNoNounsAllowed || ns.exists(tok => tok.pos.startsWith("n")))
				}

				rejectIf("MIN_WORDS") {
					model.getMinWords > ns.map(tok => tok.wordLength).sum
				}

				rejectIf("MAX_TOKENS") {
					ns.size > model.getMaxTokens
				}
			}

		/**
		 * Validates given sentence after enrichment: overlapping notes, user tokens, swear
		 * words and the configured limits on tokens, stopwords, unknown, suspicious and
		 * free words.
		 *
		 * @param w Model decorator.
		 * @param ns Sentence to validate.
		 * @param parent Optional parent span.
		 */
		@throws[NCValidateException]
		def postValidate(w: NCProbeModel, ns: NCNlpSentence, parent: Span = null): Unit =
			startScopedSpan(
				"validate",
				parent,
				"srvReqId" -> ns.srvReqId,
				"txt" -> ns.text,
				"mdlId" -> w.model.getId
			) { _ =>
				val model = w.model

				// Distinct types of all non-NLP notes found in the sentence.
				val noteTypes = ns.flatten.filter(note => !note.isNlp).map(note => note.noteType).distinct

				// Notes of the above types, kept only for tokens that hold more than one of them.
				val notesPerToken = ns.map(tok => noteTypes.flatMap(tp => tok.getNotes(tp)))
				val overlaps = notesPerToken.filter(notes => notes.size > 1).flatten

				rejectIf("OVERLAP_NOTES") {
					overlaps.nonEmpty
				}

				// At least one note outside of the "nlpcraft:" namespace is expected.
				rejectIf("ALLOW_NO_USER_TOKENS") {
					!(model.isNoUserTokensAllowed ||
						ns.exists(tok => tok.exists(note => !note.noteType.startsWith("nlpcraft:"))))
				}

				rejectIf("ALLOW_SWEAR_WORDS") {
					!model.isSwearWordsAllowed &&
						ns.exists(tok => tok.getNlpValueOpt[Boolean]("swear").getOrElse(false))
				}

				rejectIf("MIN_NON_STOPWORDS") {
					model.getMinNonStopwords > ns.count(tok => !tok.isStopWord)
				}

				rejectIf("MIN_TOKENS") {
					model.getMinTokens > ns.size
				}

				rejectIf("MAX_UNKNOWN_WORDS") {
					model.getMaxUnknownWords < ns.count(tok => tok.isNlp && !tok.isSynthetic && !tok.isKnownWord)
				}

				rejectIf("MAX_SUSPICIOUS_WORDS") {
					model.getMaxSuspiciousWords <
						ns.count(tok => tok.getNlpValueOpt[Boolean]("suspNoun").getOrElse(false))
				}

				rejectIf("MAX_FREE_WORDS") {
					model.getMaxFreeWords < ns.count(tok => tok.isNlp)
				}
			}
	}