/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.uima.cas.test;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import junit.framework.TestCase;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.FSIndex;
import org.apache.uima.cas.FSIndexRepository;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.FeatureStructure;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.admin.CASFactory;
import org.apache.uima.cas.admin.CASMgr;
import org.apache.uima.cas.admin.TypeSystemMgr;
import org.apache.uima.cas.impl.CASImpl;
import org.apache.uima.cas.impl.CASSerializer;
import org.apache.uima.cas.impl.Serialization;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.internal.util.TextStringTokenizer;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.apache.uima.test.junit_extension.JUnitExtension;
import org.apache.uima.util.CasCreationUtils;

/**
* Tests serialization of a CAS without type system meta data: after each
* processing step, the CAS is round-tripped through
* Serialization.serializeNoMetaData() and Serialization.createCAS(), and the
* annotation counts are checked to survive the round trip.
*/
public class SerializationNoMDTest extends TestCase {
public static final String TOKEN_TYPE = "Token";
public static final String TOKEN_TYPE_FEAT = "type";
public static final String TOKEN_TYPE_FEAT_Q = TOKEN_TYPE + TypeSystem.FEATURE_SEPARATOR
+ TOKEN_TYPE_FEAT;
public static final String TOKEN_TYPE_TYPE = "TokenType";
public static final String WORD_TYPE = "Word";
public static final String SEP_TYPE = "Separator";
public static final String EOS_TYPE = "EndOfSentence";
public static final String SENT_TYPE = "Sentence";
private CASMgr casMgr;
private CAS cas;
private Type wordType;
private Type separatorType;
private Type eosType;
private Type tokenType;
private Feature tokenTypeFeature;
private Type sentenceType;
private Feature startFeature;
private Feature endFeature;
public SerializationNoMDTest(String arg) {
super(arg);
}
/**
* @see junit.framework.TestCase#setUp()
*/
public void setUp() throws Exception {
super.setUp();
casMgr = initCAS();
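// initCAS() returns a CASImpl, which implements both the CASMgr and the CAS interface.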
cas = (CASImpl) casMgr;
TypeSystem ts = cas.getTypeSystem();
wordType = ts.getType(WORD_TYPE);
// assert(wordType != null);
separatorType = ts.getType(SEP_TYPE);
eosType = ts.getType(EOS_TYPE);
tokenType = ts.getType(TOKEN_TYPE);
tokenTypeFeature = ts.getFeatureByFullName(TOKEN_TYPE_FEAT_Q);
startFeature = ts.getFeatureByFullName(CAS.FEATURE_FULL_NAME_BEGIN);
endFeature = ts.getFeatureByFullName(CAS.FEATURE_FULL_NAME_END);
sentenceType = ts.getType(SENT_TYPE);
}
public void tearDown() {
casMgr = null;
cas = null;
wordType = null;
separatorType = null;
eosType = null;
tokenType = null;
tokenTypeFeature = null;
startFeature = null;
endFeature = null;
sentenceType = null;
}
// Create and initialize the CAS used by this test.
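// Builds a type system with Token and Sentence annotation types plus a TokenType
// hierarchy (Word, Separator, EndOfSentence), commits it, creates the standard
// indexes, and returns the initial view of the new CAS.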
private static CASMgr initCAS() {
// Create an initial CASMgr from the factory.
// CASMgr cas = CASFactory.createCAS();
// assert(tsa != null);
// Create a CASMgr. Ensures existence of AnnotationFS type.
// CASMgr tcas = CASFactory.createCAS();
CASMgr aCas = CASFactory.createCAS();
try {
CasCreationUtils.setupTypeSystem(aCas, (TypeSystemDescription) null);
} catch (ResourceInitializationException e) {
e.printStackTrace();
}
// Create a writable type system.
TypeSystemMgr tsa = aCas.getTypeSystemMgr();
// Add new types and features.
Type topType = tsa.getTopType();
Type annotType = tsa.getType(CAS.TYPE_NAME_ANNOTATION);
// assert(annotType != null);
tsa.addType(SENT_TYPE, annotType);
Type tokenType = tsa.addType(TOKEN_TYPE, annotType);
Type tokenTypeType = tsa.addType(TOKEN_TYPE_TYPE, topType);
tsa.addType(WORD_TYPE, tokenTypeType);
tsa.addType(SEP_TYPE, tokenTypeType);
tsa.addType(EOS_TYPE, tokenTypeType);
tsa.addFeature(TOKEN_TYPE_FEAT, tokenType, tokenTypeType);
// Commit the type system.
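// Committing freezes the type system; no types or features can be added afterwards.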
((CASImpl) aCas).commitTypeSystem();
// assert(tsa.isCommitted());
// // Create the CAS indexes.
// tcas.initCASIndexes();
// Create the Base indexes.
try {
aCas.initCASIndexes();
} catch (CASException e) {
e.printStackTrace();
}
// Commit the index repository.
aCas.getIndexRepositoryMgr().commit();
// assert(cas.getIndexRepositoryMgr().isCommitted());
// Create the default text Sofa and return CAS view
return (CASMgr) aCas.getCAS().getCurrentView();
}
// Tokenize text.
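// Creates one Token annotation per token and records the token class in the
// Token's "type" feature, pointing at a shared Word, Separator or EndOfSentence
// feature structure.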
private void tokenize() throws Exception {
// System.out.println("Tokenizing text.");
// Create FSs for the token types.
FeatureStructure wordFS = cas.createFS(wordType);
FeatureStructure sepFS = cas.createFS(separatorType);
FeatureStructure eosFS = cas.createFS(eosType);
String text = cas.getDocumentText();
TextStringTokenizer tokenizer = new TextStringTokenizer(text);
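// Configure the token classes. With setShowWhitespace(false), the tokenizer
// should not return whitespace tokens at all.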
tokenizer.setSeparators("/-*&@");
tokenizer.addWhitespaceChars(",");
tokenizer.setEndOfSentenceChars(".!?");
tokenizer.setShowWhitespace(false);
int tokenTypeCode;
int wordCounter = 0;
int sepCounter = 0;
int endOfSentenceCounter = 0;
AnnotationFS tokenAnnot;
while (tokenizer.isValid()) {
tokenAnnot = cas.createAnnotation(tokenType, tokenizer.getTokenStart(),
tokenizer.getTokenEnd());
tokenTypeCode = tokenizer.getTokenType();
switch (tokenTypeCode) {
case TextStringTokenizer.EOS: {
++endOfSentenceCounter;
tokenAnnot.setFeatureValue(tokenTypeFeature, eosFS);
break;
}
case TextStringTokenizer.SEP: {
++sepCounter;
tokenAnnot.setFeatureValue(tokenTypeFeature, sepFS);
break;
}
case TextStringTokenizer.WSP: {
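// Whitespace tokens are switched off above, so this branch is not expected to
// be reached; if it were, the annotation would keep a null "type" feature.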
break;
}
case TextStringTokenizer.WCH: {
++wordCounter;
tokenAnnot.setFeatureValue(tokenTypeFeature, wordFS);
// if ((wordCounter % 100000) == 0) {
// System.out.println("Number of words tokenized: " + wordCounter);
// }
break;
}
default: {
throw new Exception("Unexpected token type: " + tokenTypeCode);
}
}
cas.getIndexRepository().addFS(tokenAnnot);
tokenizer.setToNext();
// System.out.println("Token: " + tokenizer.nextToken());
}
// time = System.currentTimeMillis() - time;
// System.out.println("Number of words: " + wordCounter);
// int allTokens = wordCounter + sepCounter + endOfSentenceCounter;
// System.out.println("Number of tokens: " + allTokens);
// System.out.println("Time used: " + new TimeSpan(time));
// FSIterator it = cas.getAnnotationIndex(tokenType).iterator();
// int count = 0;
// while (it.isValid()) {
// ++count;
// it.moveToNext();
// }
// System.out.println("Number of tokens in index: " + count);
}
// Very (!) primitive EOS detection.
private void createSentences() {
// TypeSystem ts = cas.getTypeSystem();
// Type eosType = ts.getType(EOS_TYPE);
// Type tokenType = ts.getType(TOKEN_TYPE);
// //assert(tokenType != null);
// Type sentenceType = ts.getType(SENT_TYPE);
// Feature tokenTypeFeature = ts.getFeature(TOKEN_TYPE_FEAT);
// Feature startFeature = ts.getFeature(CAS.START_FEAT);
// Feature endFeature = ts.getFeature(CAS.END_FEAT);
// System.out.println("\nCreating sentence annotations.");
// Get a handle to the index repository.
FSIndexRepository indexRepository = cas.getIndexRepository();
// assert(indexRepository != null);
Iterator<String> labelIt = indexRepository.getLabels();
assertNotNull(labelIt);
// Get the standard index for tokens.
FSIndex<AnnotationFS> tokenIndex = cas.getAnnotationIndex(tokenType);
// assert(tokenIndex != null);
// Get an iterator over tokens.
FSIterator<AnnotationFS> it = tokenIndex.iterator();
// assert(it != null);
// Now create sentences. We do this as follows: a sentence starts where
// the first token after an EOS starts, and ends with an EOS.
long time = System.currentTimeMillis();
int endOfSentenceCounter = 0;
it.moveToFirst();
boolean lookForStart = true;
int start = 0, end; // Initialize start to pacify compiler.
FeatureStructure tokenFS, sentFS;
while (it.isValid()) {
if (lookForStart) {
// If we're looking for the start of a sentence, just grab the start
// of the current FS.
start = it.get().getIntValue(startFeature);
lookForStart = false;
} else {
// Check if we've reached the end of a sentence.
tokenFS = it.get();
if (tokenFS.getFeatureValue(tokenTypeFeature).getType() == eosType) {
end = tokenFS.getIntValue(endFeature);
sentFS = cas.createFS(sentenceType);
sentFS.setIntValue(startFeature, start);
sentFS.setIntValue(endFeature, end);
cas.getIndexRepository().addFS(sentFS);
++endOfSentenceCounter;
lookForStart = true;
}
}
it.moveToNext();
}
time = System.currentTimeMillis() - time;
// System.out.println("Created " + endOfSentenceCounter + " sentences: " + new TimeSpan(time));
}
// Check results.
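// Sanity-checks the results: the first sentence (if any) must have covered text,
// and the document text must still be available after deserialization.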
private void checkSentences() {
TypeSystem ts = cas.getTypeSystem();
Type localSentenceType = ts.getType(SENT_TYPE);
// Feature tokenTypeFeature = ts.getFeatureByFullName(TOKEN_TYPE_FEAT);
// Feature startFeature = ts.getFeatureByFullName(CAS.FEATURE_BASE_NAME_BEGIN);
// Feature endFeature = ts.getFeatureByFullName(CAS.FEATURE_BASE_NAME_END);
// Print the first few sentences.
// System.out.println("\nThe first 10 sentences:\n");
FSIndex<AnnotationFS> sentenceIndex = cas.getAnnotationIndex(localSentenceType);
FSIterator<AnnotationFS> it = sentenceIndex.iterator();
AnnotationFS sentFS;
if (it.isValid()) {
sentFS = it.get();
assertNotNull(sentFS.getCoveredText());
}
// int counter = 0;
String text = cas.getDocumentText();
assertNotNull(text);
// while (it.isValid() && counter < 10) {
// sentFS = (AnnotationFS)it.get();
// System.out.println(
// "Sentence: "
// + sentFS.getCoveredText());
// it.moveToNext();
// ++counter;
// }
// Now get an iterator over all annotations.
FSIndex<AnnotationFS> annotIndex = cas.getAnnotationIndex();
// System.out.println("\nNumber of annotations in index: " + annotIndex.size());
// Print the first few sentences.
// System.out.println("The first 50 annotations:\n");
it = annotIndex.iterator();
// assert(it.isValid());
// counter = 0;
// AnnotationFS fs;
// while (it.isValid() && counter < 50) {
// fs = (AnnotationFS)it.get();
// System.out.print(fs.getType().getName() + ": ");
// if (fs.getType().getName().equals(CASMgr.DOCUMENT_TYPE)) {
// // When we see the document, we don't print the whole text ;-)
// System.out.println("...");
// } else {
// System.out.println(
// fs.getCoveredText());
// }
// it.moveToNext();
// ++counter;
// }
}
// private static String file2String(String file) throws IOException {
// return file2String(new File(file));
// }
/**
* Read the contents of a file into a string, using the default platform encoding.
*
* @param file
* The file to be read in.
* @return String The contents of the file.
* @throws IOException
* Various I/O errors.
*/
public static String file2String(File file) throws IOException {
// Read the file into a string using a char buffer.
FileReader reader = null;
int bufSize = (int) file.length(); // length in bytes >= length in chars due to encoding
char[] buf = new char[bufSize];
int read_so_far = 0;
try {
reader = new FileReader(file);
while (read_so_far < bufSize) {
int count = reader.read(buf, read_so_far, bufSize - read_so_far);
if (count < 0) {
break;
}
read_so_far += count;
}
} finally {
if (null != reader)
reader.close();
}
return new String(buf, 0, read_so_far);
}
/**
* Test driver.
*/
public void testMain() throws Exception {
// Read the document into a String.
File textFile = JUnitExtension.getFile("data/moby.txt");
String moby = file2String(textFile);
// String moby = file2String(System.getProperty("cas.data.test") + "moby.txt");
String line;
BufferedReader br = new BufferedReader(new StringReader(moby));
StringBuilder buf = new StringBuilder();
List<String> docs = new ArrayList<String>();
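// Split the text into pseudo-documents at lines starting with ".. <p" (section
// markers in the test data).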
while ((line = br.readLine()) != null) {
if (line.startsWith(".. <p")) {
docs.add(buf.toString());
buf = new StringBuilder();
} else {
buf.append(line).append('\n');
}
}
docs.add(buf.toString());
buf = null;
final int numDocs = docs.size();
final int max = 30;
int docCount = 0;
long overallTime = System.currentTimeMillis();
int numTok, numSent;
CASSerializer cs;
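// Cycle over the documents until max of them have been processed; each document
// is round-tripped through no-meta-data serialization three times: after
// tokenization, after sentence creation, and after checking the results.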
while (docCount < max) {
for (int i = 0; i < numDocs && docCount < max; i++) {
// System.out.println("Processing document: " + i);
// Set document text in first CAS.
cas.setDocumentText(docs.get(i));
tokenize();
numTok = cas.getAnnotationIndex(tokenType).size();
assertTrue(numTok > 0);
// System.out.println(" Number of tokens: " + numTok);
// System.out.println("Serializing...");
cs = Serialization.serializeNoMetaData(cas);
cas = Serialization.createCAS(casMgr, cs);
assertEquals(numTok, cas.getAnnotationIndex(tokenType).size());
createSentences();
numSent = cas.getAnnotationIndex(sentenceType).size();
assertTrue(numSent > 0);
// System.out.println(" Number of sentences: " + numSent);
// System.out.println("Serializing...");
cs = Serialization.serializeNoMetaData(cas);
cas = Serialization.createCAS(casMgr, cs);
assertEquals(numTok, cas.getAnnotationIndex(tokenType).size());
assertEquals(numSent, cas.getAnnotationIndex(sentenceType).size());
// System.out.println(" Number of tokens: " + numTok);
checkSentences();
// System.out.println("Serializing...");
cs = Serialization.serializeNoMetaData(cas);
cas = Serialization.createCAS(casMgr, cs);
assertEquals(numTok, cas.getAnnotationIndex(tokenType).size());
assertEquals(numSent, cas.getAnnotationIndex(sentenceType).size());
// System.out.println(" Verify: " + numTok + " tokens, " + numSent + " sentences.");
casMgr.reset();
++docCount;
}
// System.out.println("Number of documents processed: " + docCount);
}
overallTime = System.currentTimeMillis() - overallTime;
// System.out.println("Time taken over all: " + new TimeSpan(overallTime));
}
public static void main(String[] args) {
junit.textui.TestRunner.run(SerializationNoMDTest.class);
}
}