blob: c429f59f91557024a699f2617569337be23685c7 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nlpcraft.model;
import java.util.*;
/**
* Detected model element. A token is a detected model element and is a part of the
* parsed user input. Sequence of tokens represents a fully parsed (see {@link NCContext#getVariants()} method) user input. A single
* token corresponds to one or more words, sequential or not, in the user sentence.
* <p>
* Note that tokens can be used to define other tokens (i.e. tokens are composable). Because of that tokens naturally
* form a tree hierarchy - see methods {@link #findPartTokens(String...)}, {@link #getAliases()}, {@link #isOfAlias(String)}
* and {@link #getPartTokens()}.
* Note also that detected model elements that tokens represent also form another hierarchy, namely a model element hierarchy that
* user can also access via {@link #getAncestors()}, {@link #getParentId()} methods. These two hierarchies should not
* be confused.
* <p>
* <b>Configuring Token Providers</b><br>
* Token providers (built-in or 3rd party) have to be enabled in the REST server <a href="https://nlpcraft.apache.org/server-and-probe.html">configuration</a>.
* Data models also have to specify tokens they are expecting the REST server and probe to detect. This is done to
* limit the unnecessary processing since implicit enabling of all token providers and all tokens can lead to a
* significant slow down of processing. REST server <a href="https://nlpcraft.apache.org/server-and-probe.html">configuration</a>
* property <code>nlpcraft.server.tokenProviders</code> provides the list of enabled token providers.
* Data models provide their required tokens in {@link NCModel#getEnabledBuiltInTokens()} method.
* <p>
* Read full documentation in <a target=_ href="https://nlpcraft.apache.org/data-model.html">Data Model</a> section and review
* <a target=_ href="https://github.com/apache/incubator-nlpcraft/tree/master/nlpcraft/src/main/scala/org/apache/nlpcraft/examples/">examples</a>.
*
* @see NCElement
* @see NCContext#getVariants()
*/
public interface NCToken extends NCMetadata {
    /**
     * Gets reference to the model this token belongs to.
     *
     * @return Model reference.
     */
    NCModelView getModel();

    /**
     * Gets ID of the server request this token is part of.
     *
     * @return ID of the server request this token is part of.
     */
    String getServerRequestId();

    /**
     * If this token represents user defined model element this method returns
     * the ID of that element. Otherwise, it returns ID of the built-in system token.
     * Note that a sentence can have multiple tokens with the same element ID.
     *
     * @return ID of the element (system or user defined).
     * @see NCElement#getId()
     */
    String getId();

    /**
     * Gets the optional parent ID of the model element this token represents. This is only available
     * for user-defined model elements (built-in tokens do not have parents).
     *
     * @return ID of the token's element immediate parent or {@code null} if not available.
     * @see NCElement#getParentId()
     * @see #getAncestors()
     */
    String getParentId();

    /**
     * Gets the list of all parent IDs from this token up to the root. This is only available
     * for user-defined model elements (built-in tokens do not have parents).
     *
     * @return List, potentially empty but never {@code null}, of all parent IDs from this token up to the root.
     * @see #getParentId()
     */
    List<String> getAncestors();

    /**
     * Tests whether this token is a child of given token ID. It is equivalent to:
     * <pre class="brush: java">
     * return getAncestors().contains(tokId);
     * </pre>
     *
     * @param tokId Ancestor token ID.
     * @return {@code true} if this token is a child of given token ID, {@code false} otherwise.
     */
    default boolean isChildOf(String tokId) {
        return getAncestors().contains(tokId);
    }

    /**
     * Gets the list of tokens this token is composed of. This method returns only immediate part tokens.
     *
     * @return List of constituent tokens, potentially empty but never {@code null}, that this token is composed of.
     * @see #findPartTokens(String...)
     */
    List<NCToken> getPartTokens();

    /**
     * Gets the list of all part tokens with given IDs or aliases traversing entire part token graph.
     * <p>
     * A part token is included when its own ID, or any of its own aliases, is among the given values.
     * The traversal is depth-first: each immediate part (if it matches) is followed by the matching
     * tokens found under it. The supplied array is not modified by this method.
     *
     * @param idOrAlias List of token IDs or aliases, potentially empty. If empty, the entire tree of part tokens
     *      is returned as a list (all immediate parts first, followed by the nested parts of each of them).
     * @return List of all part tokens with given IDs or aliases. Potentially empty but never {@code null}.
     * @see #getPartTokens()
     */
    default List<NCToken> findPartTokens(String... idOrAlias) {
        List<NCToken> parts = getPartTokens();
        List<NCToken> list = new ArrayList<>();

        if (idOrAlias.length == 0) {
            // No filter: immediate parts first, then everything nested under each of them.
            list.addAll(parts);

            for (NCToken part : parts)
                list.addAll(part.findPartTokens(idOrAlias));

            return list;
        }

        // Binary search needs a sorted array. Sort a private copy so that the caller's
        // array is left untouched (re-sorting is fast enough on small arrays).
        String[] sorted = idOrAlias.clone();

        Arrays.sort(sorted);

        for (NCToken part : parts) {
            // Check ID first.
            boolean found = Arrays.binarySearch(sorted, part.getId()) >= 0;

            if (!found) {
                // Check the aliases of the part itself (not of this enclosing token) if not found by ID.
                for (String alias : part.getAliases()) {
                    if (Arrays.binarySearch(sorted, alias) >= 0) {
                        found = true;

                        break;
                    }
                }
            }

            if (found)
                list.add(part);

            // Recursive call for the part.
            list.addAll(part.findPartTokens(sorted));
        }

        return list;
    }

    /**
     * Gets optional list of aliases this token is known by. Token can get an alias if it is a part of
     * other composed token and token DSL expression that was used to match it specified an alias. Note
     * that token can have zero, one or more aliases.
     *
     * @return List of aliases this token is known by. Can be empty, but never {@code null}.
     */
    List<String> getAliases();

    /**
     * Tests whether or not this token has given alias. It is equivalent to:
     * <pre class="brush: java">
     * return getAliases().contains(alias);
     * </pre>
     *
     * @param alias Alias to test.
     * @return {@code true} if this token has alias <code>alias</code>, {@code false} otherwise.
     */
    default boolean isOfAlias(String alias) {
        return getAliases().contains(alias);
    }

    /**
     * Gets the value if this token was detected via element's value (or its synonyms). Otherwise
     * returns {@code null}. Only applicable for user-defined model elements (built-in tokens
     * do not have values).
     *
     * @return Value for the user-defined model element or {@code null}, if not available.
     * @see NCElement#getValues()
     */
    String getValue();

    /**
     * Gets the list of groups this token belongs to. By default, if not specified explicitly, the group
     * is token's ID.
     *
     * @return Token groups list. Never {@code null} - but can be empty.
     * @see NCElement#getGroups()
     */
    List<String> getGroups();

    /**
     * Tests whether or not this token belongs to the given group. It is equivalent to:
     * <pre class="brush: java">
     * return getGroups().contains(grp);
     * </pre>
     *
     * @param grp Group to test.
     * @return {@code true} if this token belongs to the group <code>grp</code>, {@code false} otherwise.
     */
    default boolean isMemberOf(String grp) {
        return getGroups().contains(grp);
    }

    /**
     * Gets start character index of this token in the original text.
     *
     * @return Start character index of this token.
     */
    int getStartCharIndex();

    /**
     * Gets end character index of this token in the original text.
     *
     * @return End character index of this token.
     */
    int getEndCharIndex();

    /**
     * A shortcut method checking whether or not this token is a stopword. Stopwords are some extremely common
     * words which add little value in helping understanding user input and are excluded from the
     * processing entirely. For example, words like a, the, can, of, about, over, etc. are
     * typical stopwords in English. NLPCraft has built-in set of stopwords.
     * <p>
     * This method is equivalent to:
     * <pre class="brush: java">
     * return meta("nlpcraft:nlp:stopword");
     * </pre>
     *
     * @return Whether or not this token is a stopword.
     */
    default boolean isStopWord() {
        return meta("nlpcraft:nlp:stopword");
    }

    /**
     * A shortcut method checking whether or not this token represents a free word. A free word is a
     * token that was detected neither as a part of user defined nor system tokens.
     * <p>
     * This method is equivalent to:
     * <pre class="brush: java">
     * return meta("nlpcraft:nlp:freeword");
     * </pre>
     *
     * @return Whether or not this token is a freeword.
     */
    default boolean isFreeWord() {
        return meta("nlpcraft:nlp:freeword");
    }

    /**
     * A shortcut method that gets original user input text for this token.
     * <p>
     * This method is equivalent to:
     * <pre class="brush: java">
     * return meta("nlpcraft:nlp:origtext");
     * </pre>
     *
     * @return Original user input text for this token.
     */
    default String getOriginalText() {
        return meta("nlpcraft:nlp:origtext");
    }

    /**
     * A shortcut method that gets index of this token in the sentence.
     * <p>
     * This method is equivalent to:
     * <pre class="brush: java">
     * return meta("nlpcraft:nlp:index");
     * </pre>
     *
     * @return Index of this token in the sentence.
     */
    default int getIndex() {
        return meta("nlpcraft:nlp:index");
    }

    /**
     * A shortcut method that gets internal globally unique system ID of the token.
     * <p>
     * This method is equivalent to:
     * <pre class="brush: java">
     * return meta("nlpcraft:nlp:unid");
     * </pre>
     *
     * @return Internal globally unique system ID of the token.
     */
    default String getUnid() {
        return meta("nlpcraft:nlp:unid");
    }

    /**
     * Tests whether or not this token is a user-defined token.
     * <p>
     * A token is considered built-in (i.e. not user-defined) only when the part of its ID before the
     * first {@code ':'} is exactly one of {@code nlpcraft}, {@code google}, {@code opennlp},
     * {@code spacy} or {@code stanford}. IDs without a {@code ':'}, or starting with one, are
     * always treated as user-defined.
     *
     * @return {@code true} if this token is defined by the model element in the user model, {@code false} otherwise.
     */
    default boolean isUserDefined() {
        String id = getId();
        int i = id.indexOf(':');

        if (i <= 0)
            return true;

        // Exact match on the provider prefix: a substring test would wrongly classify
        // user IDs such as "nlp:x" or "open:x" as built-in.
        switch (id.substring(0, i)) {
            case "nlpcraft":
            case "google":
            case "opennlp":
            case "spacy":
            case "stanford":
                return false;

            default:
                return true;
        }
    }

    /**
     * Whether or not this token is abstract.
     * <p>
     * An abstract token is only detected when it is either a constituent part of some other non-abstract token
     * or referenced by built-in tokens. In other words, an abstract token will not be detected in a standalone
     * unreferenced position. By default (unless returned by this method), all named entities are considered to be
     * non-abstract.
     * <p>
     * Declaring tokens as abstract is important to minimize number of parsing variants automatically
     * generated as permutation of all possible parsing compositions. For example, if it is known that a particular
     * named entity will only be used as a constituent part of some other token - declaring such named entity as
     * abstract can significantly reduce the number of parsing variants leading to a better performance,
     * and often simpler corresponding intent definition and callback logic.
     *
     * @return Whether or not this token is abstract.
     */
    boolean isAbstract();
}