nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCCustomWord.java - incubator-nlpcraft - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.nlpcraft.model;

 /**
  * A partially enriched token with a basic set of NLP properties used by custom NER parser.
  *
  * @see NCModel#getParsers()
  * @see NCToken
  */
 public interface NCCustomWord {
     /**
      * Gets normalized text for this word.
      *
      * @return Normalized text.
      */
     String getNormalizedText();

     /**
      * Gets original text for this word.
      *
      * @return Original text.
      */
     String getOriginalText();

     /**
      * Gets start character index of this word in the original text.
      *
      * @return Start character index of this word.
      */
     int getStartCharIndex();

     /**
      * Gets end character index of this word in the original text.
      *
      * @return End character index of this word.
      */
     int getEndCharIndex();

     /**
      * Gets Penn Treebank POS tag for this word. Note that additionally to standard Penn Treebank POS
      * tags NLPCraft introduces {@code '---'} synthetic tag to indicate a POS tag for multi-word part.
      * Learn more at <a href="http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html">http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html</a>
      *
      * @return Penn Treebank POS tag for this word.
      */
     String getPos();

     /**
      * Gets description of Penn Treebank POS tag. Learn more at <a href="http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html">http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html</a>
      *
      * @return Description of Penn Treebank POS tag.
      */
     String getPosDescription();

     /**
      * Gets the lemma of this word, a canonical form of this word. Note that stemming and lemmatization
      * allow to reduce inflectional forms and sometimes derivationally related forms of a word to a
      * common base form. Lemmatization refers to the use of a vocabulary and morphological analysis
      * of words, normally aiming to remove inflectional endings only and to return the base or dictionary
      * form of a word, which is known as the lemma. Learn
      * more at <a href="https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html">https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html</a>     *
      *
      * @return Lemma of this word.
      */
     String getLemma();

     /**
      * Gets the stem of this word. Note that stemming and lemmatization allow to reduce inflectional forms
      * and sometimes derivationally related forms of a word to a common base form. Unlike lemma,
      * stemming is a basic heuristic process that chops off the ends of words in the hope of achieving
      * this goal correctly most of the time, and often includes the removal of derivational affixes.
      * Learn more at <a href="https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html">https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html</a>
      *
      * @return Stem of this word.
      */
     String getStem();

     /**
      * Gets whether or not this word is a stopword. Stopwords are some extremely common words which
      * add little value in helping understanding user input and are excluded from the processing
      * entirely. For example, words like {@code a, the, can, of, about, over}, etc. are typical
      * stopwords in English. NLPCraft has built-in set of stopwords. Each model can also
      * provide its own set of included and excluded stopwords.
      *
      * @return Whether or not this word is a stopword.
      */
     boolean isStopWord();

     /**
      * Gets whether or not this word is surrounded by any of {@code '[', ']', '{', '}', '(', ')'} brackets.
      *
      * @return Whether or not this word is surrounded by any of {@code '[', ']', '{', '}', '(', ')'} brackets.
      */
     boolean isBracketed();

     /**
      * Gets whether or not this word is surrounded by single or double quotes.
      *
      * @return Whether or not this word is surrounded by single or double quotes.
      */
     boolean isQuoted();

     /**
      * Tests whether or not this token is found in Princeton WordNet database.
      *
      * @return Princeton WordNet database inclusion flag.
      */
     boolean isKnownWord();

     /**
      * Tests whether or not the given token is a swear word. NLPCraft has built-in list of common English swear words.
      *
      * @return Swear word flag.
      */
     boolean isSwearWord();

     /**
      * Tests whether the given token represents an English word. Note that this only checks that token's text
      * consists of characters of English alphabet, i.e. the text doesn't have to be necessary
      * a known valid English word.
      *
      * @return Whether this token represents an English word.
      */
     boolean isEnglish();
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.nlpcraft.model;

	/**
	* A partially enriched token with a basic set of NLP properties used by custom NER parser.
	*
	* @see NCModel#getParsers()
	* @see NCToken
	*/
	public interface NCCustomWord {
	/**
	* Gets normalized text for this word.
	*
	* @return Normalized text.
	*/
	String getNormalizedText();

	/**
	* Gets original text for this word.
	*
	* @return Original text.
	*/
	String getOriginalText();

	/**
	* Gets start character index of this word in the original text.
	*
	* @return Start character index of this word.
	*/
	int getStartCharIndex();

	/**
	* Gets end character index of this word in the original text.
	*
	* @return End character index of this word.
	*/
	int getEndCharIndex();

	/**
	* Gets Penn Treebank POS tag for this word. Note that additionally to standard Penn Treebank POS
	* tags NLPCraft introduces {@code '---'} synthetic tag to indicate a POS tag for multi-word part.
	* Learn more at <a href="http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html">http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html</a>
	*
	* @return Penn Treebank POS tag for this word.
	*/
	String getPos();

	/**
	* Gets description of Penn Treebank POS tag. Learn more at <a href="http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html">http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html</a>
	*
	* @return Description of Penn Treebank POS tag.
	*/
	String getPosDescription();

	/**
	* Gets the lemma of this word, a canonical form of this word. Note that stemming and lemmatization
	* allow to reduce inflectional forms and sometimes derivationally related forms of a word to a
	* common base form. Lemmatization refers to the use of a vocabulary and morphological analysis
	* of words, normally aiming to remove inflectional endings only and to return the base or dictionary
	* form of a word, which is known as the lemma. Learn
	* more at <a href="https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html">https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html</a> *
	*
	* @return Lemma of this word.
	*/
	String getLemma();

	/**
	* Gets the stem of this word. Note that stemming and lemmatization allow to reduce inflectional forms
	* and sometimes derivationally related forms of a word to a common base form. Unlike lemma,
	* stemming is a basic heuristic process that chops off the ends of words in the hope of achieving
	* this goal correctly most of the time, and often includes the removal of derivational affixes.
	* Learn more at <a href="https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html">https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html</a>
	*
	* @return Stem of this word.
	*/
	String getStem();

	/**
	* Gets whether or not this word is a stopword. Stopwords are some extremely common words which
	* add little value in helping understanding user input and are excluded from the processing
	* entirely. For example, words like {@code a, the, can, of, about, over}, etc. are typical
	* stopwords in English. NLPCraft has built-in set of stopwords. Each model can also
	* provide its own set of included and excluded stopwords.
	*
	* @return Whether or not this word is a stopword.
	*/
	boolean isStopWord();

	/**
	* Gets whether or not this word is surrounded by any of {@code '[', ']', '{', '}', '(', ')'} brackets.
	*
	* @return Whether or not this word is surrounded by any of {@code '[', ']', '{', '}', '(', ')'} brackets.
	*/
	boolean isBracketed();

	/**
	* Gets whether or not this word is surrounded by single or double quotes.
	*
	* @return Whether or not this word is surrounded by single or double quotes.
	*/
	boolean isQuoted();

	/**
	* Tests whether or not this token is found in Princeton WordNet database.
	*
	* @return Princeton WordNet database inclusion flag.
	*/
	boolean isKnownWord();

	/**
	* Tests whether or not the given token is a swear word. NLPCraft has built-in list of common English swear words.
	*
	* @return Swear word flag.
	*/
	boolean isSwearWord();

	/**
	* Tests whether the given token represents an English word. Note that this only checks that token's text
	* consists of characters of English alphabet, i.e. the text doesn't have to be necessary
	* a known valid English word.
	*
	* @return Whether this token represents an English word.
	*/
	boolean isEnglish();
	}