blob: 624340e3b333d093a996c8511be7bb035691c72e [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.fit.testing.factory;
import java.util.ArrayList;
import java.util.List;
import org.apache.uima.UIMAException;
import org.apache.uima.cas.Feature;
import org.apache.uima.fit.factory.AnnotationFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
/**
 * This class provides convenience methods for creating tokens and sentences and adding them to a
 * {@link JCas}.
 * <p>
 * Token boundaries are taken from a whitespace-delimited "tokens string"; every line of that string
 * becomes one sentence annotation spanning the tokens found on the line. Optionally, a
 * part-of-speech tag and a stem can be stored on each token in string features whose base names
 * are configured on the builder.
 *
 * @param <TOKEN_TYPE>
 *          the type system token type (e.g. {@code org.apache.uima.fit.examples.type.Token})
 * @param <SENTENCE_TYPE>
 *          the type system sentence type (e.g. {@code org.apache.uima.fit.examples.type.Sentence})
 */
public class TokenBuilder<TOKEN_TYPE extends Annotation, SENTENCE_TYPE extends Annotation> {
  private final Class<TOKEN_TYPE> tokenClass;

  private final Class<SENTENCE_TYPE> sentenceClass;

  // Base name of the token feature receiving the part-of-speech tag; null if not configured.
  private String posFeatureName;

  // Base name of the token feature receiving the stem; null if not configured.
  private String stemFeatureName;

  /**
   * Calls {@link TokenBuilder#TokenBuilder(Class, Class, String, String)} with the last two
   * arguments as null.
   *
   * @param aTokenClass
   *          the class of your token type from your type system (e.g.
   *          org.apache.uima.fit.type.Token.class)
   * @param aSentenceClass
   *          the class of your sentence type from your type system (e.g.
   *          org.apache.uima.fit.type.Sentence.class)
   */
  public TokenBuilder(Class<TOKEN_TYPE> aTokenClass, Class<SENTENCE_TYPE> aSentenceClass) {
    this(aTokenClass, aSentenceClass, null, null);
  }

  /**
   * Instantiates a TokenBuilder with the type system information that the builder needs to build
   * tokens.
   *
   * @param aTokenClass
   *          the class of your token type from your type system (e.g.
   *          org.apache.uima.fit.type.Token.class)
   * @param aSentenceClass
   *          the class of your sentence type from your type system (e.g.
   *          org.apache.uima.fit.type.Sentence.class)
   * @param aPosFeatureName
   *          the feature name for the part-of-speech tag for your token type. This assumes that
   *          there is a single string feature for which to put your pos tag. null is an ok value.
   * @param aStemFeatureName
   *          the feature name for the stem for your token type. This assumes that there is a single
   *          string feature for which to put your stem. null is an ok value.
   */
  public TokenBuilder(final Class<TOKEN_TYPE> aTokenClass,
          final Class<SENTENCE_TYPE> aSentenceClass, String aPosFeatureName,
          String aStemFeatureName) {
    tokenClass = aTokenClass;
    sentenceClass = aSentenceClass;
    // Assign the fields directly instead of going through the public (overridable) setters, so
    // that no subclass code runs while this object is still being constructed.
    posFeatureName = aPosFeatureName;
    stemFeatureName = aStemFeatureName;
  }

  /**
   * Instantiates a TokenBuilder with the type system information that the builder needs to build
   * tokens.
   *
   * @param <T>
   *          the type system token type (e.g. {@code org.apache.uima.fit.examples.type.Token})
   * @param <S>
   *          the type system sentence type (e.g.
   *          {@code org.apache.uima.fit.examples.type.Sentence})
   * @param aTokenClass
   *          the class of your token type from your type system (e.g.
   *          {@code org.apache.uima.fit.type.Token})
   * @param aSentenceClass
   *          the class of your sentence type from your type system (e.g.
   *          {@code org.apache.uima.fit.type.Sentence})
   * @return the builder.
   */
  public static <T extends Annotation, S extends Annotation> TokenBuilder<T, S> create(
          Class<T> aTokenClass, Class<S> aSentenceClass) {
    return new TokenBuilder<T, S>(aTokenClass, aSentenceClass);
  }

  /**
   * Set the feature name for the part-of-speech tag for your token type. This assumes that there is
   * a single string feature for which to put your pos tag. null is an ok value.
   *
   * @param aPosFeatureName
   *          the part-of-speech feature name.
   */
  public void setPosFeatureName(String aPosFeatureName) {
    posFeatureName = aPosFeatureName;
  }

  /**
   * Set the feature name for the stem for your token type. This assumes that there is a single
   * string feature for which to put your stem. null is an ok value.
   *
   * @param aStemFeatureName
   *          the stem feature name.
   */
  public void setStemFeatureName(String aStemFeatureName) {
    stemFeatureName = aStemFeatureName;
  }

  /**
   * Builds white-space delimited tokens from the input text. The text itself is used as the tokens
   * string.
   *
   * @param aJCas
   *          the JCas to add the Token annotations to
   * @param aText
   *          the text to initialize the {@link JCas} with
   * @throws IllegalArgumentException
   *           if {@code aText} is null
   */
  public void buildTokens(JCas aJCas, String aText) {
    if (aText == null) {
      throw new IllegalArgumentException("text may not be null.");
    }
    buildTokens(aJCas, aText, aText, null, null);
  }

  /**
   * Builds tokens for the given text using the token boundaries given by the tokens string. No
   * part-of-speech tags or stems are set.
   *
   * @param aJCas
   *          the JCas to add the Token annotations to
   * @param aText
   *          the text to initialize the {@link JCas} with
   * @param aTokensString
   *          the tokensString must have the same non-white space characters as the text. The
   *          tokensString is used to identify token boundaries using white space - i.e. the only
   *          difference between the 'text' parameter and the 'tokensString' parameter is that the
   *          latter may have more whitespace characters. For example, if the text is "She ran."
   *          then the tokensString might be "She ran ."
   * @throws IllegalArgumentException
   *           if {@code aTokensString} or {@code aText} is null, or if a token cannot be found in
   *           the text
   * @see #buildTokens(JCas, String, String, String, String)
   */
  public void buildTokens(JCas aJCas, String aText, String aTokensString) {
    if (aTokensString == null) {
      throw new IllegalArgumentException("tokensString may not be null.");
    }
    buildTokens(aJCas, aText, aTokensString, null, null);
  }

  /**
   * Builds tokens for the given text and tokens string and sets a part-of-speech tag on each
   * token. No stems are set.
   *
   * @param aJCas
   *          the JCas to add the Token annotations to
   * @param aText
   *          the text to initialize the {@link JCas} with
   * @param aTokensString
   *          the tokensString must have the same non-white space characters as the text. The
   *          tokensString is used to identify token boundaries using white space - i.e. the only
   *          difference between the 'text' parameter and the 'tokensString' parameter is that the
   *          latter may have more whitespace characters. For example, if the text is "She ran."
   *          then the tokensString might be "She ran ."
   * @param aPosTagsString
   *          the posTagsString should be a space delimited string of part-of-speech tags - one for
   *          each token
   * @throws IllegalArgumentException
   *           under the conditions described for
   *           {@link #buildTokens(JCas, String, String, String, String)}
   * @see #buildTokens(JCas, String, String, String, String)
   */
  public void buildTokens(JCas aJCas, String aText, String aTokensString, String aPosTagsString) {
    buildTokens(aJCas, aText, aTokensString, aPosTagsString, null);
  }

  /**
   * Build tokens for the given text, tokens, part-of-speech tags, and word stems.
   * <p>
   * The document text of the JCas is set to {@code aText}. Every line of {@code aTokensString}
   * yields one sentence annotation that spans from the first to the last token of that line; lines
   * without any token yield no annotation. Each token is located in the text by searching forward
   * from the end of the previously located token. All arguments are validated before the JCas is
   * modified, except for problems that only show up while tokens are being located (a token that
   * does not occur in the text, or too few tags/stems).
   *
   * @param aJCas
   *          the JCas to add the Token annotations to
   * @param aText
   *          the text to initialize the {@link JCas} with
   * @param aTokensString
   *          the tokensString must have the same non-white space characters as the text. The
   *          tokensString is used to identify token boundaries using white space - i.e. the only
   *          difference between the 'text' parameter and the 'tokensString' parameter is that the
   *          latter may have more whitespace characters. For example, if the text is "She ran."
   *          then the tokensString might be "She ran ."
   * @param aPosTagsString
   *          the posTagsString should be a space delimited string of part-of-speech tags - one for
   *          each token
   * @param aStemsString
   *          the stemsString should be a space delimited string of stems - one for each token
   * @throws IllegalArgumentException
   *           if {@code aText} or {@code aTokensString} is null, if tags (or stems) are given but
   *           no corresponding feature name has been configured, if a token cannot be found in
   *           the text, or if there are fewer tags (or stems) than tokens
   */
  public void buildTokens(JCas aJCas, String aText, String aTokensString, String aPosTagsString,
          String aStemsString) {
    // Reject bad arguments up front so that a failed call does not leave the JCas half-modified.
    if (aText == null) {
      throw new IllegalArgumentException("text may not be null.");
    }
    if (aTokensString == null) {
      throw new IllegalArgumentException("tokensString may not be null.");
    }
    if (aPosTagsString != null && posFeatureName == null) {
      throw new IllegalArgumentException("posTagsString must be null if TokenBuilder is "
              + "not initialized with a feature name corresponding to the part-of-speech "
              + "feature of the token type (assuming your token type has such a feature).");
    }
    if (aStemsString != null && stemFeatureName == null) {
      throw new IllegalArgumentException("stemsString must be null if TokenBuilder is not "
              + "initialized with a feature name corresponding to the stem feature "
              + "of the token type (assuming your token type has such a feature).");
    }

    aJCas.setDocumentText(aText);

    Feature posFeature = posFeatureName != null ? findTokenFeature(aJCas, posFeatureName) : null;
    Feature stemFeature = stemFeatureName != null ? findTokenFeature(aJCas, stemFeatureName)
            : null;

    // Swallow the whitespace around line breaks so that each remaining line is one sentence.
    String[] sentenceStrings = aTokensString.replaceAll("\\s*\n\\s*", "\n").split("\n");
    String[] posTags = splitOnWhitespace(aPosTagsString);
    String[] stems = splitOnWhitespace(aStemsString);

    int offset = 0; // end of the most recently located token within aText
    int tokenIndex = 0; // running index over all tokens, used to pick the matching tag/stem
    for (String sentenceString : sentenceStrings) {
      List<Annotation> tokenAnnotations = new ArrayList<Annotation>();
      for (String tokenString : sentenceString.trim().split("\\s+")) {
        if (tokenString.isEmpty()) {
          // Splitting a blank line yields a single empty string. It is not a token: annotating
          // it would create a zero-length token and shift all following tags/stems by one.
          continue;
        }

        // Locate the token at or after the end of the previous one.
        int start = aText.indexOf(tokenString, offset);
        if (start < 0) {
          throw new IllegalArgumentException(
                  String.format("unable to find string %s", tokenString));
        }
        offset = start + tokenString.length();

        Annotation token = AnnotationFactory.createAnnotation(aJCas, start, offset, tokenClass);
        tokenAnnotations.add(token);

        // Set the part of speech and the stem if present.
        if (posTags != null) {
          token.setStringValue(posFeature,
                  valueForToken(posTags, tokenIndex, "part-of-speech tags"));
        }
        if (stems != null) {
          token.setStringValue(stemFeature, valueForToken(stems, tokenIndex, "stems"));
        }
        tokenIndex++;
      }

      // The sentence covers everything from its first to its last token.
      if (!tokenAnnotations.isEmpty()) {
        int begin = tokenAnnotations.get(0).getBegin();
        int end = tokenAnnotations.get(tokenAnnotations.size() - 1).getEnd();
        AnnotationFactory.createAnnotation(aJCas, begin, end, sentenceClass);
      }
    }
  }

  /**
   * Looks up a feature of the token type by its base name in the type system of the given JCas.
   */
  private Feature findTokenFeature(JCas aJCas, String aFeatureName) {
    return aJCas.getTypeSystem().getType(tokenClass.getName())
            .getFeatureByBaseName(aFeatureName);
  }

  /**
   * Splits a whitespace-delimited list into its items, ignoring leading and trailing whitespace;
   * returns null if the list itself is null.
   */
  private static String[] splitOnWhitespace(String aValues) {
    return aValues != null ? aValues.trim().split("\\s+") : null;
  }

  /**
   * Returns the tag/stem for the token with the given index, failing with a descriptive
   * {@link IllegalArgumentException} if fewer values than tokens were supplied.
   */
  private static String valueForToken(String[] aValues, int aTokenIndex, String aKind) {
    if (aTokenIndex >= aValues.length) {
      throw new IllegalArgumentException(String.format(
              "expected at least %d %s (one for each token) but found only %d", aTokenIndex + 1,
              aKind, aValues.length));
    }
    return aValues[aTokenIndex];
  }
}