blob: 03ed3b00db0ceaf598a8399521d0efe93fd12c02 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nlpcraft.model;
/**
* A partially enriched token with a basic set of NLP properties used by custom NER parser.
*
* @see NCModel#getParsers()
* @see NCToken
*/
public interface NCCustomWord {
/**
* Gets normalized text for this word.
*
* @return Normalized text.
*/
String getNormalizedText();
/**
* Gets original text for this word.
*
* @return Original text.
*/
String getOriginalText();
/**
* Gets start character index of this word in the original text.
*
* @return Start character index of this word.
*/
int getStartCharIndex();
/**
* Gets end character index of this word in the original text.
*
* @return End character index of this word.
*/
int getEndCharIndex();
/**
* Gets Penn Treebank POS tag for this word. Note that additionally to standard Penn Treebank POS
* tags NLPCraft introduces {@code '---'} synthetic tag to indicate a POS tag for multi-word part.
* Learn more at <a href="http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html">http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html</a>
*
* @return Penn Treebank POS tag for this word.
*/
String getPos();
/**
* Gets description of Penn Treebank POS tag. Learn more at <a href="http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html">http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html</a>
*
* @return Description of Penn Treebank POS tag.
*/
String getPosDescription();
/**
* Gets the lemma of this word, a canonical form of this word. Note that stemming and lemmatization
* allow to reduce inflectional forms and sometimes derivationally related forms of a word to a
* common base form. Lemmatization refers to the use of a vocabulary and morphological analysis
* of words, normally aiming to remove inflectional endings only and to return the base or dictionary
* form of a word, which is known as the lemma. Learn
* more at <a href="https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html">https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html</a> *
*
* @return Lemma of this word.
*/
String getLemma();
/**
* Gets the stem of this word. Note that stemming and lemmatization allow to reduce inflectional forms
* and sometimes derivationally related forms of a word to a common base form. Unlike lemma,
* stemming is a basic heuristic process that chops off the ends of words in the hope of achieving
* this goal correctly most of the time, and often includes the removal of derivational affixes.
* Learn more at <a href="https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html">https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html</a>
*
* @return Stem of this word.
*/
String getStem();
/**
* Gets whether or not this word is a stopword. Stopwords are some extremely common words which
* add little value in helping understanding user input and are excluded from the processing
* entirely. For example, words like {@code a, the, can, of, about, over}, etc. are typical
* stopwords in English. NLPCraft has built-in set of stopwords. Each model can also
* provide its own set of included and excluded stopwords.
*
* @return Whether or not this word is a stopword.
*/
boolean isStopWord();
/**
* Gets whether or not this word is surrounded by any of {@code '[', ']', '{', '}', '(', ')'} brackets.
*
* @return Whether or not this word is surrounded by any of {@code '[', ']', '{', '}', '(', ')'} brackets.
*/
boolean isBracketed();
/**
* Gets whether or not this word is surrounded by single or double quotes.
*
* @return Whether or not this word is surrounded by single or double quotes.
*/
boolean isQuoted();
/**
* Tests whether or not this token is found in Princeton WordNet database.
*
* @return Princeton WordNet database inclusion flag.
*/
boolean isKnownWord();
/**
* Tests whether or not the given token is a swear word. NLPCraft has built-in list of common English swear words.
*
* @return Swear word flag.
*/
boolean isSwearWord();
/**
* Tests whether the given token represents an English word. Note that this only checks that token's text
* consists of characters of English alphabet, i.e. the text doesn't have to be necessary
* a known valid English word.
*
* @return Whether this token represents an English word.
*/
boolean isEnglish();
}