uimafit-core/src/main/java/org/apache/uima/fit/testing/factory/TokenBuilder.java - uima-uimafit - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */
 package org.apache.uima.fit.testing.factory;

 import java.util.ArrayList;
 import java.util.List;

 import org.apache.uima.UIMAException;
 import org.apache.uima.cas.Feature;
 import org.apache.uima.fit.factory.AnnotationFactory;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.tcas.Annotation;

 /**
  * This class provides convenience methods for creating tokens and sentences and add them to a
  * {@link JCas}.
  *
  * @param <TOKEN_TYPE>
  *          the type system token type (e.g. {@code org.apache.uima.fit.examples.type.Token})
  * @param <SENTENCE_TYPE>
  *          the type system sentence type (e.g. {@code org.apache.uima.fit.examples.type.Sentence})
  */

 public class TokenBuilder<TOKEN_TYPE extends Annotation, SENTENCE_TYPE extends Annotation> {
   /** Annotation class that is instantiated for every token. */
   private final Class<TOKEN_TYPE> tokenClass;

   /** Annotation class that is instantiated for every sentence (one per line of the tokens string). */
   private final Class<SENTENCE_TYPE> sentenceClass;

   /** Base name of the token feature receiving the part-of-speech tag; may be {@code null}. */
   private String posFeatureName;

   /** Base name of the token feature receiving the stem; may be {@code null}. */
   private String stemFeatureName;

   /**
    * Calls {@link TokenBuilder#TokenBuilder(Class, Class, String, String)} with the last two
    * arguments as null.
    *
    * @param aTokenClass
    *          the class of your token type from your type system (e.g.
    *          org.apache.uima.fit.type.Token.class)
    * @param aSentenceClass
    *          the class of your sentence type from your type system (e.g.
    *          org.apache.uima.fit.type.Sentence.class)
    */
   public TokenBuilder(Class<TOKEN_TYPE> aTokenClass, Class<SENTENCE_TYPE> aSentenceClass) {
     this(aTokenClass, aSentenceClass, null, null);
   }

   /**
    * Instantiates a TokenBuilder with the type system information that the builder needs to build
    * tokens.
    *
    * @param aTokenClass
    *          the class of your token type from your type system (e.g.
    *          org.apache.uima.fit.type.Token.class)
    * @param aSentenceClass
    *          the class of your sentence type from your type system (e.g.
    *          org.apache.uima.fit.type.Sentence.class)
    * @param aPosFeatureName
    *          the feature name for the part-of-speech tag for your token type. This assumes that
    *          there is a single string feature for which to put your pos tag. null is an ok value.
    * @param aStemFeatureName
    *          the feature name for the stem for your token type. This assumes that there is a single
    *          string feature for which to put your stem. null is an ok value.
    */
   public TokenBuilder(final Class<TOKEN_TYPE> aTokenClass, final Class<SENTENCE_TYPE> aSentenceClass,
           String aPosFeatureName, String aStemFeatureName) {
     tokenClass = aTokenClass;
     sentenceClass = aSentenceClass;
     // Kept as setter calls (rather than direct assignment) so existing subclasses that override
     // the setters continue to observe the constructor arguments.
     setPosFeatureName(aPosFeatureName);
     setStemFeatureName(aStemFeatureName);
   }

   /**
    * Static factory equivalent to {@link TokenBuilder#TokenBuilder(Class, Class)}; no
    * part-of-speech or stem feature name is configured.
    *
    * @param <T>
    *          the type system token type (e.g. {@code org.apache.uima.fit.examples.type.Token})
    * @param <S>
    *          the type system sentence type (e.g. {@code org.apache.uima.fit.examples.type.Sentence})
    * @param aTokenClass
    *          the class of your token type from your type system (e.g.
    *          {@code org.apache.uima.fit.type.Token})
    * @param aSentenceClass
    *          the class of your sentence type from your type system (e.g.
    *          {@code org.apache.uima.fit.type.Sentence})
    * @return the builder.
    */
   public static <T extends Annotation, S extends Annotation> TokenBuilder<T, S> create(
           Class<T> aTokenClass, Class<S> aSentenceClass) {
     return new TokenBuilder<T, S>(aTokenClass, aSentenceClass);
   }

   /**
    * Set the feature name for the part-of-speech tag for your token type. This assumes that there is
    * a single string feature for which to put your pos tag. null is an ok value.
    *
    * @param aPosFeatureName
    *          the part-of-speech feature name.
    */
   public void setPosFeatureName(String aPosFeatureName) {
     posFeatureName = aPosFeatureName;
   }

   /**
    * Set the feature name for the stem for your token type. This assumes that there is a single
    * string feature for which to put your stem. null is an ok value.
    *
    * @param aStemFeatureName
    *          the stem feature name.
    */
   public void setStemFeatureName(String aStemFeatureName) {
     stemFeatureName = aStemFeatureName;
   }

   /**
    * Builds white-space delimited tokens from the input text. The text itself is used as the
    * tokens string.
    *
    * @param aJCas
    *          the JCas to add the Token annotations to
    * @param aText
    *          the text to initialize the {@link JCas} with
    * @throws IllegalArgumentException
    *           if {@code aText} is null
    */
   public void buildTokens(JCas aJCas, String aText) {
     if (aText == null) {
       throw new IllegalArgumentException("text may not be null.");
     }
     buildTokens(aJCas, aText, aText, null, null);
   }

   /**
    * Builds tokens whose boundaries are given by a separate tokens string; no part-of-speech tags
    * or stems are set.
    *
    * @param aJCas
    *          the JCas to add the Token annotations to
    * @param aText
    *          the text to initialize the {@link JCas} with
    * @param aTokensString
    *          the tokensString must have the same non-white space characters as the text. The
    *          tokensString is used to identify token boundaries using white space - i.e. the only
    *          difference between the 'text' parameter and the 'tokensString' parameter is that the
    *          latter may have more whitespace characters. For example, if the text is "She ran."
    *          then the tokensString might be "She ran ."
    * @throws IllegalArgumentException
    *           if {@code aText} or {@code aTokensString} is null
    * @see #buildTokens(JCas, String, String, String, String)
    */
   public void buildTokens(JCas aJCas, String aText, String aTokensString) {
     if (aTokensString == null) {
       throw new IllegalArgumentException("tokensString may not be null.");
     }
     buildTokens(aJCas, aText, aTokensString, null, null);
   }

   /**
    * Builds tokens with part-of-speech tags but without stems.
    *
    * @param aJCas
    *          the JCas to add the Token annotations to
    * @param aText
    *          the text to initialize the {@link JCas} with
    * @param aTokensString
    *          the tokensString must have the same non-white space characters as the text. The
    *          tokensString is used to identify token boundaries using white space - i.e. the only
    *          difference between the 'text' parameter and the 'tokensString' parameter is that the
    *          latter may have more whitespace characters. For example, if the text is "She ran."
    *          then the tokensString might be "She ran ."
    * @param aPosTagsString
    *          the posTagsString should be a space delimited string of part-of-speech tags - one for
    *          each token
    * @see #buildTokens(JCas, String, String, String, String)
    */
   public void buildTokens(JCas aJCas, String aText, String aTokensString, String aPosTagsString) {
     buildTokens(aJCas, aText, aTokensString, aPosTagsString, null);
   }

   /**
    * Build tokens for the given text, tokens, part-of-speech tags, and word stems. Every
    * non-blank line of the tokens string additionally yields one sentence annotation spanning
    * from the first to the last token of that line.
    *
    * @param aJCas
    *          the JCas to add the Token annotations to
    * @param aText
    *          the text to initialize the {@link JCas} with
    * @param aTokensString
    *          the tokensString must have the same non-white space characters as the text. The
    *          tokensString is used to identify token boundaries using white space - i.e. the only
    *          difference between the 'text' parameter and the 'tokensString' parameter is that the
    *          latter may have more whitespace characters. For example, if the text is "She ran."
    *          then the tokensString might be "She ran ."
    * @param aPosTagsString
    *          the posTagsString should be a space delimited string of part-of-speech tags - one for
    *          each token
    * @param aStemsString
    *          the stemsString should be a space delimited string of stems - one for each token
    * @throws IllegalArgumentException
    *           if the text or tokens string is null, if tags/stems are supplied without the
    *           corresponding feature name being configured, if a token cannot be located in the
    *           text, or if there are fewer tags/stems than tokens
    */
   public void buildTokens(JCas aJCas, String aText, String aTokensString, String aPosTagsString,
           String aStemsString) {
     // Same checks (and messages) as the shorter overloads, so that direct callers of this
     // method get an IllegalArgumentException instead of a NullPointerException further down.
     if (aText == null) {
       throw new IllegalArgumentException("text may not be null.");
     }
     if (aTokensString == null) {
       throw new IllegalArgumentException("tokensString may not be null.");
     }

     aJCas.setDocumentText(aText);

     if (aPosTagsString != null && posFeatureName == null) {
       throw new IllegalArgumentException("posTagsString must be null if TokenBuilder is "
               + "not initialized with a feature name corresponding to the part-of-speech "
               + "feature of the token type (assuming your token type has such a feature).");
     }

     if (aStemsString != null && stemFeatureName == null) {
       throw new IllegalArgumentException("stemsString must be null if TokenBuilder is not "
               + "initialized with a feature name corresponding to the stem feature "
               + "of the token type (assuming your token type has such a feature).");
     }

     Feature posFeature = findTokenFeature(aJCas, posFeatureName);
     Feature stemFeature = findTokenFeature(aJCas, stemFeatureName);

     // Collapse all whitespace surrounding a line break so that each line is one sentence.
     String[] sentenceStrings = aTokensString.replaceAll("\\s*\n\\s*", "\n").split("\n");
     String[] posTags = aPosTagsString != null ? aPosTagsString.split("\\s+") : null;
     String[] stems = aStemsString != null ? aStemsString.split("\\s+") : null;

     int offset = 0;
     int tokenIndex = 0;

     for (String sentenceString : sentenceStrings) {
       String trimmedSentence = sentenceString.trim();
       if (trimmedSentence.length() == 0) {
         // Splitting an empty string yields a single empty token string, which matches at any
         // offset; skip it so no zero-length token/sentence is created and no tag is consumed.
         continue;
       }

       List<Annotation> tokenAnnotations = new ArrayList<Annotation>();
       for (String tokenString : trimmedSentence.split("\\s+")) {
         // locate the token at or after the end of the previous token
         int start = aText.indexOf(tokenString, offset);
         if (start < 0) {
           throw new IllegalArgumentException(String.format("unable to find string %s",
                   tokenString));
         }

         // add the Token
         offset = start + tokenString.length();
         Annotation token = AnnotationFactory.createAnnotation(aJCas, start, offset, tokenClass);
         tokenAnnotations.add(token);

         // set the stem and part of speech if present
         if (posTags != null) {
           token.setStringValue(posFeature,
                   valueForToken(posTags, tokenIndex, "part-of-speech tag", tokenString));
         }
         if (stems != null) {
           token.setStringValue(stemFeature,
                   valueForToken(stems, tokenIndex, "stem", tokenString));
         }
         tokenIndex++;
       }

       // A non-empty trimmed sentence always yields at least one token.
       int begin = tokenAnnotations.get(0).getBegin();
       int end = tokenAnnotations.get(tokenAnnotations.size() - 1).getEnd();
       AnnotationFactory.createAnnotation(aJCas, begin, end, sentenceClass);
     }
   }

   /**
    * Looks up a feature of the token type by its base name in the type system of the given JCas.
    *
    * @param aJCas
    *          the JCas whose type system is queried
    * @param aFeatureName
    *          the base name of the feature, or null
    * @return null if {@code aFeatureName} is null, otherwise the result of the lookup
    */
   private Feature findTokenFeature(JCas aJCas, String aFeatureName) {
     if (aFeatureName == null) {
       return null;
     }
     return aJCas.getTypeSystem().getType(tokenClass.getName())
             .getFeatureByBaseName(aFeatureName);
   }

   /**
    * Returns the value aligned with the token at the given index, failing with a descriptive
    * message when fewer values than tokens were supplied.
    *
    * @param aValues
    *          the whitespace-split tags or stems
    * @param aTokenIndex
    *          the zero-based index of the token across all sentences
    * @param aDescription
    *          what the values are, used in the error message only
    * @param aTokenString
    *          the covered text of the token, used in the error message only
    * @return the value at {@code aTokenIndex}
    */
   private static String valueForToken(String[] aValues, int aTokenIndex, String aDescription,
           String aTokenString) {
     if (aTokenIndex >= aValues.length) {
       throw new IllegalArgumentException(String.format(
               "no %s for token %s at index %d: only %d value(s) were provided", aDescription,
               aTokenString, aTokenIndex, aValues.length));
     }
     return aValues[aTokenIndex];
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/
	package org.apache.uima.fit.testing.factory;

	import java.util.ArrayList;
	import java.util.List;

	import org.apache.uima.UIMAException;
	import org.apache.uima.cas.Feature;
	import org.apache.uima.fit.factory.AnnotationFactory;
	import org.apache.uima.jcas.JCas;
	import org.apache.uima.jcas.tcas.Annotation;

	/**
	* This class provides convenience methods for creating tokens and sentences and add them to a
	* {@link JCas}.
	*
	* @param <TOKEN_TYPE>
	* the type system token type (e.g. {@code org.apache.uima.fit.examples.type.Token})
	* @param <SENTENCE_TYPE>
	* the type system sentence type (e.g. {@code org.apache.uima.fit.examples.type.Sentence})
	*/

	public class TokenBuilder<TOKEN_TYPE extends Annotation, SENTENCE_TYPE extends Annotation> {
	  /** Annotation class that is instantiated for every token. */
	  private final Class<TOKEN_TYPE> tokenClass;

	  /** Annotation class that is instantiated for every sentence (one per line of the tokens string). */
	  private final Class<SENTENCE_TYPE> sentenceClass;

	  /** Base name of the token feature receiving the part-of-speech tag; may be {@code null}. */
	  private String posFeatureName;

	  /** Base name of the token feature receiving the stem; may be {@code null}. */
	  private String stemFeatureName;

	  /**
	   * Calls {@link TokenBuilder#TokenBuilder(Class, Class, String, String)} with the last two
	   * arguments as null.
	   *
	   * @param aTokenClass
	   *          the class of your token type from your type system (e.g.
	   *          org.apache.uima.fit.type.Token.class)
	   * @param aSentenceClass
	   *          the class of your sentence type from your type system (e.g.
	   *          org.apache.uima.fit.type.Sentence.class)
	   */
	  public TokenBuilder(Class<TOKEN_TYPE> aTokenClass, Class<SENTENCE_TYPE> aSentenceClass) {
	    this(aTokenClass, aSentenceClass, null, null);
	  }

	  /**
	   * Instantiates a TokenBuilder with the type system information that the builder needs to build
	   * tokens.
	   *
	   * @param aTokenClass
	   *          the class of your token type from your type system (e.g.
	   *          org.apache.uima.fit.type.Token.class)
	   * @param aSentenceClass
	   *          the class of your sentence type from your type system (e.g.
	   *          org.apache.uima.fit.type.Sentence.class)
	   * @param aPosFeatureName
	   *          the feature name for the part-of-speech tag for your token type. This assumes that
	   *          there is a single string feature for which to put your pos tag. null is an ok value.
	   * @param aStemFeatureName
	   *          the feature name for the stem for your token type. This assumes that there is a single
	   *          string feature for which to put your stem. null is an ok value.
	   */
	  public TokenBuilder(final Class<TOKEN_TYPE> aTokenClass, final Class<SENTENCE_TYPE> aSentenceClass,
	          String aPosFeatureName, String aStemFeatureName) {
	    tokenClass = aTokenClass;
	    sentenceClass = aSentenceClass;
	    // Kept as setter calls (rather than direct assignment) so existing subclasses that override
	    // the setters continue to observe the constructor arguments.
	    setPosFeatureName(aPosFeatureName);
	    setStemFeatureName(aStemFeatureName);
	  }

	  /**
	   * Static factory equivalent to {@link TokenBuilder#TokenBuilder(Class, Class)}; no
	   * part-of-speech or stem feature name is configured.
	   *
	   * @param <T>
	   *          the type system token type (e.g. {@code org.apache.uima.fit.examples.type.Token})
	   * @param <S>
	   *          the type system sentence type (e.g. {@code org.apache.uima.fit.examples.type.Sentence})
	   * @param aTokenClass
	   *          the class of your token type from your type system (e.g.
	   *          {@code org.apache.uima.fit.type.Token})
	   * @param aSentenceClass
	   *          the class of your sentence type from your type system (e.g.
	   *          {@code org.apache.uima.fit.type.Sentence})
	   * @return the builder.
	   */
	  public static <T extends Annotation, S extends Annotation> TokenBuilder<T, S> create(
	          Class<T> aTokenClass, Class<S> aSentenceClass) {
	    return new TokenBuilder<T, S>(aTokenClass, aSentenceClass);
	  }

	  /**
	   * Set the feature name for the part-of-speech tag for your token type. This assumes that there is
	   * a single string feature for which to put your pos tag. null is an ok value.
	   *
	   * @param aPosFeatureName
	   *          the part-of-speech feature name.
	   */
	  public void setPosFeatureName(String aPosFeatureName) {
	    posFeatureName = aPosFeatureName;
	  }

	  /**
	   * Set the feature name for the stem for your token type. This assumes that there is a single
	   * string feature for which to put your stem. null is an ok value.
	   *
	   * @param aStemFeatureName
	   *          the stem feature name.
	   */
	  public void setStemFeatureName(String aStemFeatureName) {
	    stemFeatureName = aStemFeatureName;
	  }

	  /**
	   * Builds white-space delimited tokens from the input text. The text itself is used as the
	   * tokens string.
	   *
	   * @param aJCas
	   *          the JCas to add the Token annotations to
	   * @param aText
	   *          the text to initialize the {@link JCas} with
	   * @throws IllegalArgumentException
	   *           if {@code aText} is null
	   */
	  public void buildTokens(JCas aJCas, String aText) {
	    if (aText == null) {
	      throw new IllegalArgumentException("text may not be null.");
	    }
	    buildTokens(aJCas, aText, aText, null, null);
	  }

	  /**
	   * Builds tokens whose boundaries are given by a separate tokens string; no part-of-speech tags
	   * or stems are set.
	   *
	   * @param aJCas
	   *          the JCas to add the Token annotations to
	   * @param aText
	   *          the text to initialize the {@link JCas} with
	   * @param aTokensString
	   *          the tokensString must have the same non-white space characters as the text. The
	   *          tokensString is used to identify token boundaries using white space - i.e. the only
	   *          difference between the 'text' parameter and the 'tokensString' parameter is that the
	   *          latter may have more whitespace characters. For example, if the text is "She ran."
	   *          then the tokensString might be "She ran ."
	   * @throws IllegalArgumentException
	   *           if {@code aText} or {@code aTokensString} is null
	   * @see #buildTokens(JCas, String, String, String, String)
	   */
	  public void buildTokens(JCas aJCas, String aText, String aTokensString) {
	    if (aTokensString == null) {
	      throw new IllegalArgumentException("tokensString may not be null.");
	    }
	    buildTokens(aJCas, aText, aTokensString, null, null);
	  }

	  /**
	   * Builds tokens with part-of-speech tags but without stems.
	   *
	   * @param aJCas
	   *          the JCas to add the Token annotations to
	   * @param aText
	   *          the text to initialize the {@link JCas} with
	   * @param aTokensString
	   *          the tokensString must have the same non-white space characters as the text. The
	   *          tokensString is used to identify token boundaries using white space - i.e. the only
	   *          difference between the 'text' parameter and the 'tokensString' parameter is that the
	   *          latter may have more whitespace characters. For example, if the text is "She ran."
	   *          then the tokensString might be "She ran ."
	   * @param aPosTagsString
	   *          the posTagsString should be a space delimited string of part-of-speech tags - one for
	   *          each token
	   * @see #buildTokens(JCas, String, String, String, String)
	   */
	  public void buildTokens(JCas aJCas, String aText, String aTokensString, String aPosTagsString) {
	    buildTokens(aJCas, aText, aTokensString, aPosTagsString, null);
	  }

	  /**
	   * Build tokens for the given text, tokens, part-of-speech tags, and word stems. Every
	   * non-blank line of the tokens string additionally yields one sentence annotation spanning
	   * from the first to the last token of that line.
	   *
	   * @param aJCas
	   *          the JCas to add the Token annotations to
	   * @param aText
	   *          the text to initialize the {@link JCas} with
	   * @param aTokensString
	   *          the tokensString must have the same non-white space characters as the text. The
	   *          tokensString is used to identify token boundaries using white space - i.e. the only
	   *          difference between the 'text' parameter and the 'tokensString' parameter is that the
	   *          latter may have more whitespace characters. For example, if the text is "She ran."
	   *          then the tokensString might be "She ran ."
	   * @param aPosTagsString
	   *          the posTagsString should be a space delimited string of part-of-speech tags - one for
	   *          each token
	   * @param aStemsString
	   *          the stemsString should be a space delimited string of stems - one for each token
	   * @throws IllegalArgumentException
	   *           if the text or tokens string is null, if tags/stems are supplied without the
	   *           corresponding feature name being configured, if a token cannot be located in the
	   *           text, or if there are fewer tags/stems than tokens
	   */
	  public void buildTokens(JCas aJCas, String aText, String aTokensString, String aPosTagsString,
	          String aStemsString) {
	    // Same checks (and messages) as the shorter overloads, so that direct callers of this
	    // method get an IllegalArgumentException instead of a NullPointerException further down.
	    if (aText == null) {
	      throw new IllegalArgumentException("text may not be null.");
	    }
	    if (aTokensString == null) {
	      throw new IllegalArgumentException("tokensString may not be null.");
	    }

	    aJCas.setDocumentText(aText);

	    if (aPosTagsString != null && posFeatureName == null) {
	      throw new IllegalArgumentException("posTagsString must be null if TokenBuilder is "
	              + "not initialized with a feature name corresponding to the part-of-speech "
	              + "feature of the token type (assuming your token type has such a feature).");
	    }

	    if (aStemsString != null && stemFeatureName == null) {
	      throw new IllegalArgumentException("stemsString must be null if TokenBuilder is not "
	              + "initialized with a feature name corresponding to the stem feature "
	              + "of the token type (assuming your token type has such a feature).");
	    }

	    Feature posFeature = findTokenFeature(aJCas, posFeatureName);
	    Feature stemFeature = findTokenFeature(aJCas, stemFeatureName);

	    // Collapse ALL whitespace surrounding a line break (zero or more characters on either
	    // side, including further line breaks) so that each line is one sentence.
	    String[] sentenceStrings = aTokensString.replaceAll("\\s*\n\\s*", "\n").split("\n");
	    String[] posTags = aPosTagsString != null ? aPosTagsString.split("\\s+") : null;
	    String[] stems = aStemsString != null ? aStemsString.split("\\s+") : null;

	    int offset = 0;
	    int tokenIndex = 0;

	    for (String sentenceString : sentenceStrings) {
	      String trimmedSentence = sentenceString.trim();
	      if (trimmedSentence.length() == 0) {
	        // Splitting an empty string yields a single empty token string, which matches at any
	        // offset; skip it so no zero-length token/sentence is created and no tag is consumed.
	        continue;
	      }

	      List<Annotation> tokenAnnotations = new ArrayList<Annotation>();
	      for (String tokenString : trimmedSentence.split("\\s+")) {
	        // locate the token at or after the end of the previous token
	        int start = aText.indexOf(tokenString, offset);
	        if (start < 0) {
	          throw new IllegalArgumentException(String.format("unable to find string %s",
	                  tokenString));
	        }

	        // add the Token
	        offset = start + tokenString.length();
	        Annotation token = AnnotationFactory.createAnnotation(aJCas, start, offset, tokenClass);
	        tokenAnnotations.add(token);

	        // set the stem and part of speech if present
	        if (posTags != null) {
	          token.setStringValue(posFeature,
	                  valueForToken(posTags, tokenIndex, "part-of-speech tag", tokenString));
	        }
	        if (stems != null) {
	          token.setStringValue(stemFeature,
	                  valueForToken(stems, tokenIndex, "stem", tokenString));
	        }
	        tokenIndex++;
	      }

	      // A non-empty trimmed sentence always yields at least one token.
	      int begin = tokenAnnotations.get(0).getBegin();
	      int end = tokenAnnotations.get(tokenAnnotations.size() - 1).getEnd();
	      AnnotationFactory.createAnnotation(aJCas, begin, end, sentenceClass);
	    }
	  }

	  /**
	   * Looks up a feature of the token type by its base name in the type system of the given JCas.
	   *
	   * @param aJCas
	   *          the JCas whose type system is queried
	   * @param aFeatureName
	   *          the base name of the feature, or null
	   * @return null if {@code aFeatureName} is null, otherwise the result of the lookup
	   */
	  private Feature findTokenFeature(JCas aJCas, String aFeatureName) {
	    if (aFeatureName == null) {
	      return null;
	    }
	    return aJCas.getTypeSystem().getType(tokenClass.getName())
	            .getFeatureByBaseName(aFeatureName);
	  }

	  /**
	   * Returns the value aligned with the token at the given index, failing with a descriptive
	   * message when fewer values than tokens were supplied.
	   *
	   * @param aValues
	   *          the whitespace-split tags or stems
	   * @param aTokenIndex
	   *          the zero-based index of the token across all sentences
	   * @param aDescription
	   *          what the values are, used in the error message only
	   * @param aTokenString
	   *          the covered text of the token, used in the error message only
	   * @return the value at {@code aTokenIndex}
	   */
	  private static String valueForToken(String[] aValues, int aTokenIndex, String aDescription,
	          String aTokenString) {
	    if (aTokenIndex >= aValues.length) {
	      throw new IllegalArgumentException(String.format(
	              "no %s for token %s at index %d: only %d value(s) were provided", aDescription,
	              aTokenString, aTokenIndex, aValues.length));
	    }
	    return aValues[aTokenIndex];
	  }
	}