/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.uima.cas.test;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import junit.framework.TestCase;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.FSIndex;
import org.apache.uima.cas.FSIndexRepository;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.FeatureStructure;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.admin.CASFactory;
import org.apache.uima.cas.admin.CASMgr;
import org.apache.uima.cas.admin.TypeSystemMgr;
import org.apache.uima.cas.impl.CASImpl;
import org.apache.uima.cas.impl.CASSerializer;
import org.apache.uima.cas.impl.Serialization;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.internal.util.TextStringTokenizer;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.apache.uima.test.junit_extension.JUnitExtension;
import org.apache.uima.util.CasCreationUtils;

/**
* Tests serialization of a CAS without type system meta data: after each
* processing step, the CAS is round-tripped through
* Serialization.serializeNoMetaData() and Serialization.createCAS(), and the
* annotation counts are checked to survive the round trip.
*/
public class SerializationNoMDTest extends TestCase {
public static final String TOKEN_TYPE = "Token";
public static final String TOKEN_TYPE_FEAT = "type";
public static final String TOKEN_TYPE_FEAT_Q = TOKEN_TYPE + TypeSystem.FEATURE_SEPARATOR
+ TOKEN_TYPE_FEAT;
public static final String TOKEN_TYPE_TYPE = "TokenType";
public static final String WORD_TYPE = "Word";
public static final String SEP_TYPE = "Separator";
public static final String EOS_TYPE = "EndOfSentence";
public static final String SENT_TYPE = "Sentence";
private CASMgr casMgr;
private CAS cas;
private Type wordType;
private Type separatorType;
private Type eosType;
private Type tokenType;
private Feature tokenTypeFeature;
private Type sentenceType;
private Feature startFeature;
private Feature endFeature;
public SerializationNoMDTest(String arg) {
super(arg);
}
/**
* @see junit.framework.TestCase#setUp()
*/
public void setUp() throws Exception {
super.setUp();
casMgr = initCAS();
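// initCAS() returns a CASImpl, which implements both the CASMgr and the CAS interface.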
cas = (CASImpl) casMgr;
TypeSystem ts = cas.getTypeSystem();
wordType = ts.getType(WORD_TYPE);
// assert(wordType != null);
separatorType = ts.getType(SEP_TYPE);
eosType = ts.getType(EOS_TYPE);
tokenType = ts.getType(TOKEN_TYPE);
tokenTypeFeature = ts.getFeatureByFullName(TOKEN_TYPE_FEAT_Q);
startFeature = ts.getFeatureByFullName(CAS.FEATURE_FULL_NAME_BEGIN);
endFeature = ts.getFeatureByFullName(CAS.FEATURE_FULL_NAME_END);
sentenceType = ts.getType(SENT_TYPE);
}
public void tearDown() {
casMgr = null;
cas = null;
wordType = null;
separatorType = null;
eosType = null;
tokenType = null;
tokenTypeFeature = null;
startFeature = null;
endFeature = null;
sentenceType = null;
}
// Create and initialize the CAS used by this test.
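// Builds a type system with Token and Sentence annotation types plus a TokenType
// hierarchy (Word, Separator, EndOfSentence), commits it, creates the standard
// indexes, and returns the initial view of the new CAS.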
private static CASMgr initCAS() {
// Create an initial CASMgr from the factory.
// CASMgr cas = CASFactory.createCAS();
// assert(tsa != null);
// Create a CASMgr. Ensures existence of AnnotationFS type.
// CASMgr tcas = CASFactory.createCAS();
CASMgr aCas = CASFactory.createCAS();
try {
CasCreationUtils.setupTypeSystem(aCas, (TypeSystemDescription) null);
} catch (ResourceInitializationException e) {
e.printStackTrace();
}
// Create a writable type system.
TypeSystemMgr tsa = aCas.getTypeSystemMgr();
// Add new types and features.
Type topType = tsa.getTopType();
Type annotType = tsa.getType(CAS.TYPE_NAME_ANNOTATION);
// assert(annotType != null);
tsa.addType(SENT_TYPE, annotType);
Type tokenType = tsa.addType(TOKEN_TYPE, annotType);
Type tokenTypeType = tsa.addType(TOKEN_TYPE_TYPE, topType);
tsa.addType(WORD_TYPE, tokenTypeType);
tsa.addType(SEP_TYPE, tokenTypeType);
tsa.addType(EOS_TYPE, tokenTypeType);
tsa.addFeature(TOKEN_TYPE_FEAT, tokenType, tokenTypeType);
// Commit the type system.
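// Committing freezes the type system; no types or features can be added afterwards.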
((CASImpl) aCas).commitTypeSystem();
// assert(tsa.isCommitted());
// // Create the CAS indexes.
// tcas.initCASIndexes();
// Create the Base indexes.
try {
aCas.initCASIndexes();
} catch (CASException e) {
e.printStackTrace();
}
// Commit the index repository.
aCas.getIndexRepositoryMgr().commit();
// assert(cas.getIndexRepositoryMgr().isCommitted());
// Create the default text Sofa and return CAS view
return (CASMgr) aCas.getCAS().getCurrentView();
}
// Tokenize text.
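// Creates one Token annotation per token and records the token class in the
// Token's "type" feature, pointing at a shared Word, Separator or EndOfSentence
// feature structure.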
private void tokenize() throws Exception {
// System.out.println("Tokenizing text.");
// Create FSs for the token types.
FeatureStructure wordFS = cas.createFS(wordType);
FeatureStructure sepFS = cas.createFS(separatorType);
FeatureStructure eosFS = cas.createFS(eosType);
String text = cas.getDocumentText();
TextStringTokenizer tokenizer = new TextStringTokenizer(text);
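// Configure the token classes. With setShowWhitespace(false), the tokenizer
// should not return whitespace tokens at all.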
tokenizer.setSeparators("/-*&@");
tokenizer.addWhitespaceChars(",");
tokenizer.setEndOfSentenceChars(".!?");
tokenizer.setShowWhitespace(false);
int tokenTypeCode;
int wordCounter = 0;
int sepCounter = 0;
int endOfSentenceCounter = 0;
AnnotationFS tokenAnnot;
while (tokenizer.isValid()) {
tokenAnnot = cas.createAnnotation(tokenType, tokenizer.getTokenStart(),
tokenizer.getTokenEnd());
tokenTypeCode = tokenizer.getTokenType();
switch (tokenTypeCode) {
case TextStringTokenizer.EOS: {
++endOfSentenceCounter;
tokenAnnot.setFeatureValue(tokenTypeFeature, eosFS);
break;
}
case TextStringTokenizer.SEP: {
++sepCounter;
tokenAnnot.setFeatureValue(tokenTypeFeature, sepFS);
break;
}
case TextStringTokenizer.WSP: {
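// Whitespace tokens are switched off above, so this branch is not expected to
// be reached; if it were, the annotation would keep a null "type" feature.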
break;
}
case TextStringTokenizer.WCH: {
++wordCounter;
tokenAnnot.setFeatureValue(tokenTypeFeature, wordFS);
// if ((wordCounter % 100000) == 0) {
// System.out.println("Number of words tokenized: " + wordCounter);
// }
break;
}
default: {
throw new Exception("Unexpected token type: " + tokenTypeCode);
}
}
cas.getIndexRepository().addFS(tokenAnnot);
tokenizer.setToNext();
// System.out.println("Token: " + tokenizer.nextToken());
}
// time = System.currentTimeMillis() - time;
// System.out.println("Number of words: " + wordCounter);
// int allTokens = wordCounter + sepCounter + endOfSentenceCounter;
// System.out.println("Number of tokens: " + allTokens);
// System.out.println("Time used: " + new TimeSpan(time));
// FSIterator it = cas.getAnnotationIndex(tokenType).iterator();
// int count = 0;
// while (it.isValid()) {
// ++count;
// it.moveToNext();
// }
// System.out.println("Number of tokens in index: " + count);
}
// Very (!) primitive EOS detection.
private void createSentences() {
// TypeSystem ts = cas.getTypeSystem();
// Type eosType = ts.getType(EOS_TYPE);
// Type tokenType = ts.getType(TOKEN_TYPE);
// //assert(tokenType != null);
// Type sentenceType = ts.getType(SENT_TYPE);
// Feature tokenTypeFeature = ts.getFeature(TOKEN_TYPE_FEAT);
// Feature startFeature = ts.getFeature(CAS.START_FEAT);
// Feature endFeature = ts.getFeature(CAS.END_FEAT);
// System.out.println("\nCreating sentence annotations.");
// Get a handle to the index repository.
FSIndexRepository indexRepository = cas.getIndexRepository();
// assert(indexRepository != null);
Iterator<String> labelIt = indexRepository.getLabels();
assertNotNull(labelIt);
// Get the standard index for tokens.
FSIndex<AnnotationFS> tokenIndex = cas.getAnnotationIndex(tokenType);
// assert(tokenIndex != null);
// Get an iterator over tokens.
FSIterator<AnnotationFS> it = tokenIndex.iterator();
// assert(it != null);
// Now create sentences. We do this as follows: a sentence starts where
// the first token after an EOS starts, and ends with an EOS.
long time = System.currentTimeMillis();
int endOfSentenceCounter = 0;
it.moveToFirst();
boolean lookForStart = true;
int start = 0, end; // Initialize start to pacify compiler.
FeatureStructure tokenFS, sentFS;
while (it.isValid()) {
if (lookForStart) {
// If we're looking for the start of a sentence, just grab the start
// of the current FS.
start = it.get().getIntValue(startFeature);
lookForStart = false;
} else {
// Check if we've reached the end of a sentence.
tokenFS = it.get();
if (tokenFS.getFeatureValue(tokenTypeFeature).getType() == eosType) {
end = tokenFS.getIntValue(endFeature);
sentFS = cas.createFS(sentenceType);
sentFS.setIntValue(startFeature, start);
sentFS.setIntValue(endFeature, end);
cas.getIndexRepository().addFS(sentFS);
++endOfSentenceCounter;
lookForStart = true;
}
}
it.moveToNext();
}
time = System.currentTimeMillis() - time;
// System.out.println("Created " + endOfSentenceCounter + " sentences: " + new TimeSpan(time));
}
// Check results.
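// Sanity-checks the results: the first sentence (if any) must have covered text,
// and the document text must still be available after deserialization.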
private void checkSentences() {
TypeSystem ts = cas.getTypeSystem();
Type localSentenceType = ts.getType(SENT_TYPE);
// Feature tokenTypeFeature = ts.getFeatureByFullName(TOKEN_TYPE_FEAT);
// Feature startFeature = ts.getFeatureByFullName(CAS.FEATURE_BASE_NAME_BEGIN);
// Feature endFeature = ts.getFeatureByFullName(CAS.FEATURE_BASE_NAME_END);
// Print the first few sentences.
// System.out.println("\nThe first 10 sentences:\n");
FSIndex<AnnotationFS> sentenceIndex = cas.getAnnotationIndex(localSentenceType);
FSIterator<AnnotationFS> it = sentenceIndex.iterator();
AnnotationFS sentFS;
if (it.isValid()) {
sentFS = it.get();
assertNotNull(sentFS.getCoveredText());
}
// int counter = 0;
String text = cas.getDocumentText();
assertNotNull(text);
// while (it.isValid() && counter < 10) {
// sentFS = (AnnotationFS)it.get();
// System.out.println(
// "Sentence: "
// + sentFS.getCoveredText());
// it.moveToNext();
// ++counter;
// }
// Now get an iterator over all annotations.
FSIndex<AnnotationFS> annotIndex = cas.getAnnotationIndex();
// System.out.println("\nNumber of annotations in index: " + annotIndex.size());
// Print the first few sentences.
// System.out.println("The first 50 annotations:\n");
it = annotIndex.iterator();
// assert(it.isValid());
// counter = 0;
// AnnotationFS fs;
// while (it.isValid() && counter < 50) {
// fs = (AnnotationFS)it.get();
// System.out.print(fs.getType().getName() + ": ");
// if (fs.getType().getName().equals(CASMgr.DOCUMENT_TYPE)) {
// // When we see the document, we don't print the whole text ;-)
// System.out.println("...");
// } else {
// System.out.println(
// fs.getCoveredText());
// }
// it.moveToNext();
// ++counter;
// }
}
// private static String file2String(String file) throws IOException {
// return file2String(new File(file));
// }
/**
* Read the contents of a file into a string, using the default platform encoding.
*
* @param file
* The file to be read in.
* @return String The contents of the file.
* @throws IOException
* Various I/O errors.
*/
public static String file2String(File file) throws IOException {
// Read the file into a string using a char buffer.
FileReader reader = null;
int bufSize = (int) file.length(); // length in bytes >= length in chars due to encoding
char[] buf = new char[bufSize];
int read_so_far = 0;
try {
reader = new FileReader(file);
while (read_so_far < bufSize) {
int count = reader.read(buf, read_so_far, bufSize - read_so_far);
if (count < 0) {
break;
}
read_so_far += count;
}
} finally {
if (null != reader)
reader.close();
}
return new String(buf, 0, read_so_far);
}
/**
* Test driver.
*/
public void testMain() throws Exception {
// Read the document into a String.
File textFile = JUnitExtension.getFile("data/moby.txt");
String moby = file2String(textFile);
// String moby = file2String(System.getProperty("cas.data.test") + "moby.txt");
String line;
BufferedReader br = new BufferedReader(new StringReader(moby));
StringBuilder buf = new StringBuilder();
List<String> docs = new ArrayList<String>();
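// Split the text into pseudo-documents at lines starting with ".. <p" (section
// markers in the test data).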
while ((line = br.readLine()) != null) {
if (line.startsWith(".. <p")) {
docs.add(buf.toString());
buf = new StringBuilder();
} else {
buf.append(line).append('\n');
}
}
docs.add(buf.toString());
buf = null;
final int numDocs = docs.size();
final int max = 30;
int docCount = 0;
long overallTime = System.currentTimeMillis();
int numTok, numSent;
CASSerializer cs;
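// Cycle over the documents until max of them have been processed; each document
// is round-tripped through no-meta-data serialization three times: after
// tokenization, after sentence creation, and after checking the results.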
while (docCount < max) {
for (int i = 0; i < numDocs && docCount < max; i++) {
// System.out.println("Processing document: " + i);
// Set document text in first CAS.
cas.setDocumentText(docs.get(i));
tokenize();
numTok = cas.getAnnotationIndex(tokenType).size();
assertTrue(numTok > 0);
// System.out.println(" Number of tokens: " + numTok);
// System.out.println("Serializing...");
cs = Serialization.serializeNoMetaData(cas);
cas = Serialization.createCAS(casMgr, cs);
assertEquals(numTok, cas.getAnnotationIndex(tokenType).size());
createSentences();
numSent = cas.getAnnotationIndex(sentenceType).size();
assertTrue(numSent > 0);
// System.out.println(" Number of sentences: " + numSent);
// System.out.println("Serializing...");
cs = Serialization.serializeNoMetaData(cas);
cas = Serialization.createCAS(casMgr, cs);
assertEquals(numTok, cas.getAnnotationIndex(tokenType).size());
assertEquals(numSent, cas.getAnnotationIndex(sentenceType).size());
// System.out.println(" Number of tokens: " + numTok);
checkSentences();
// System.out.println("Serializing...");
cs = Serialization.serializeNoMetaData(cas);
cas = Serialization.createCAS(casMgr, cs);
assertEquals(numTok, cas.getAnnotationIndex(tokenType).size());
assertEquals(numSent, cas.getAnnotationIndex(sentenceType).size());
// System.out.println(" Verify: " + numTok + " tokens, " + numSent + " sentences.");
casMgr.reset();
++docCount;
}
// System.out.println("Number of documents processed: " + docCount);
}
overallTime = System.currentTimeMillis() - overallTime;
// System.out.println("Time taken over all: " + new TimeSpan(overallTime));
}
public static void main(String[] args) {
junit.textui.TestRunner.run(SerializationNoMDTest.class);
}
}