// blob: fc5d79b3fedb2ae3dd5f884687705096191ac116 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.fit.testing.factory;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import java.io.File;
import java.io.FileInputStream;
import java.util.Collection;
import java.util.Iterator;
import org.apache.commons.io.IOUtils;
import org.apache.uima.UIMAException;
import org.apache.uima.cas.FSIndex;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.fit.ComponentTestBase;
import org.apache.uima.fit.type.Sentence;
import org.apache.uima.fit.type.Token;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.pear.util.FileUtil;
import org.junit.Test;
/**
 * Tests for the token, sentence, part-of-speech and stem annotations created through
 * {@code tokenBuilder.buildTokens(...)}.
 * <p>
 * The {@code jCas} and {@code tokenBuilder} fixtures used throughout are not declared in this
 * class; they are presumably inherited from {@link ComponentTestBase} (not visible here).
 */
public class TokenBuilderTest extends ComponentTestBase {

  /**
   * Builds tokens from an explicit token string whose two sentences are separated by
   * {@code "\r\n"}, and checks sentence texts plus offsets and part-of-speech tags of selected
   * tokens. The document text has no whitespace between the two sentences, so "We" is expected
   * to start right after "?".
   */
  @Test
  public void test1() {
    String text = "What if we built a rocket ship made of cheese?"
            + "We could fly it to the moon for repairs.";
    tokenBuilder
            .buildTokens(
                    jCas,
                    text,
                    "What if we built a rocket ship made of cheese ? \r\n We could fly it to the moon for repairs .",
                    "A B C D E F G H I J K L M N O P Q R S T U");

    FSIndex<Annotation> sentenceIndex = jCas.getAnnotationIndex(Sentence.type);
    assertEquals(2, sentenceIndex.size());
    FSIterator<Annotation> sentences = sentenceIndex.iterator();
    Sentence sentence = (Sentence) sentences.next();
    assertEquals("What if we built a rocket ship made of cheese?", sentence.getCoveredText());
    sentence = (Sentence) sentences.next();
    assertEquals("We could fly it to the moon for repairs.", sentence.getCoveredText());

    FSIndex<Annotation> tokenIndex = jCas.getAnnotationIndex(Token.type);
    assertEquals(21, tokenIndex.size());
    testToken(JCasUtil.selectByIndex(jCas, Token.class, 0), "What", 0, 4, "A", null);
    testToken(JCasUtil.selectByIndex(jCas, Token.class, 1), "if", 5, 7, "B", null);
    testToken(JCasUtil.selectByIndex(jCas, Token.class, 9), "cheese", 39, 45, "J", null);
    testToken(JCasUtil.selectByIndex(jCas, Token.class, 10), "?", 45, 46, "K", null);
    testToken(JCasUtil.selectByIndex(jCas, Token.class, 11), "We", 46, 48, "L", null);
    testToken(JCasUtil.selectByIndex(jCas, Token.class, 12), "could", 49, 54, "M", null);
    testToken(JCasUtil.selectByIndex(jCas, Token.class, 19), "repairs", 78, 85, "T", null);
    testToken(JCasUtil.selectByIndex(jCas, Token.class, 20), ".", 85, 86, "U", null);
  }

  /**
   * Checks token offsets around a sentence break when the document text contains {@code " \n"}
   * between the sentences. The same text is tokenized twice: once with the newline in the token
   * string surrounded by spaces, and once with the newline directly adjacent to the neighbouring
   * tokens. Both runs are expected to yield identical offsets.
   */
  @Test
  public void test2() {
    String text = "What if we built a rocket ship made of cheese? \n"
            + "We could fly it to the moon for repairs.";
    tokenBuilder
            .buildTokens(
                    jCas,
                    text,
                    "What if we built a rocket ship made of cheese ? \n We could fly it to the moon for repairs .",
                    "A B C D E F G H I J K L M N O P Q R S T U");
    testToken(JCasUtil.selectByIndex(jCas, Token.class, 10), "?", 45, 46, "K", null);
    testToken(JCasUtil.selectByIndex(jCas, Token.class, 11), "We", 48, 50, "L", null);

    // Start over with an empty CAS before the second variant.
    jCas.reset();
    text = "What if we built a rocket ship made of cheese? \n"
            + "We could fly it to the moon for repairs.";
    tokenBuilder
            .buildTokens(
                    jCas,
                    text,
                    "What if we built a rocket ship made of cheese ?\nWe could fly it to the moon for repairs .",
                    "A B C D E F G H I J K L M N O P Q R S T U");
    testToken(JCasUtil.selectByIndex(jCas, Token.class, 10), "?", 45, 46, "K", null);
    testToken(JCasUtil.selectByIndex(jCas, Token.class, 11), "We", 48, 50, "L", null);
  }

  /**
   * Builds tokens from the document text alone (no explicit token string). The assertions expect
   * 13 tokens, with the final period staying attached to the last word ("rider."), no
   * part-of-speech or stem values, and a single sentence covering the whole text.
   */
  @Test
  public void test3() {
    String text = "If you like line writer, then you should really check out line rider.";
    tokenBuilder.buildTokens(jCas, text);

    FSIndex<Annotation> tokenIndex = jCas.getAnnotationIndex(Token.type);
    assertEquals(13, tokenIndex.size());
    testToken(JCasUtil.selectByIndex(jCas, Token.class, 0), "If", 0, 2, null, null);
    testToken(JCasUtil.selectByIndex(jCas, Token.class, 12), "rider.", 63, 69, null, null);

    FSIndex<Annotation> sentenceIndex = jCas.getAnnotationIndex(Sentence.type);
    assertEquals(1, sentenceIndex.size());
    Sentence sentence = JCasUtil.selectByIndex(jCas, Sentence.class, 0);
    assertEquals(text, sentence.getCoveredText());
  }

  /**
   * Asserts the covered text, offsets, part-of-speech tag and stem of a single token.
   *
   * @param token
   *          the token under test
   * @param coveredText
   *          the expected covered text
   * @param begin
   *          the expected begin offset
   * @param end
   *          the expected end offset
   * @param partOfSpeech
   *          the expected part-of-speech value, or {@code null} if none is expected
   * @param stem
   *          the expected stem value, or {@code null} if none is expected
   */
  private void testToken(Token token, String coveredText, int begin, int end, String partOfSpeech,
          String stem) {
    assertEquals(coveredText, token.getCoveredText());
    assertEquals(begin, token.getBegin());
    assertEquals(end, token.getEnd());
    assertEquals(partOfSpeech, token.getPos());
    assertEquals(stem, token.getStem());
  }

  /**
   * Pins down the behaviour of {@link String#split(String)} for a string with leading and
   * trailing separators: the leading empty string is kept while the trailing one is dropped,
   * giving two elements.
   */
  @Test
  public void testSpaceSplit() {
    String[] splits = " asdf ".split(" ");
    assertEquals(2, splits.length);
  }

  /**
   * Expects an {@link IllegalArgumentException} when the token string does not correspond to the
   * document text.
   */
  @Test
  public void testBadInput() {
    String text = "If you like line writer, then you should really check out line rider.";
    IllegalArgumentException iae = null;
    try {
      tokenBuilder.buildTokens(jCas, text,
              "If you like line rider, then you really don't need line writer");
    } catch (IllegalArgumentException e) {
      // Captured so that the assertion below fails when nothing was thrown.
      iae = e;
    }
    assertNotNull(iae);
  }

  /**
   * Builds tokens with explicit part-of-speech and stem strings and checks that the fifth token
   * carries the matching values and that the document text is set on the CAS.
   */
  @Test
  public void testStems() {
    String text = "Me and all my friends are non-conformists.";
    tokenBuilder.buildTokens(jCas, text, "Me and all my friends are non - conformists .",
            "M A A M F A N - C .", "me and all my friend are non - conformist .");
    assertEquals("Me and all my friends are non-conformists.", jCas.getDocumentText());

    Token friendToken = JCasUtil.selectByIndex(jCas, Token.class, 4);
    assertEquals("friends", friendToken.getCoveredText());
    assertEquals("F", friendToken.getPos());
    assertEquals("friend", friendToken.getStem());
  }

  /**
   * Uses a token string that splits the text more finely than its whitespace does and counts the
   * resulting tokens by walking the annotation index.
   */
  @Test
  public void test4() {
    String text = "a b-c de--fg h,i,j,k";
    tokenBuilder.buildTokens(jCas, text, "a b - c d e - - f g h , i , j , k");

    FSIterator<Annotation> tokens = jCas.getAnnotationIndex(Token.type).iterator();
    int tokenCount = 0;
    while (tokens.hasNext()) {
      tokenCount++;
      tokens.next();
    }
    assertEquals(17, tokenCount);
  }

  /**
   * Builds tokens in a newly created view ("MyView") rather than in {@code jCas} itself and
   * checks the last token found in that view.
   */
  @Test
  public void test5() throws Exception {
    JCas myView = jCas.createView("MyView");
    tokenBuilder.buildTokens(myView, "red and blue cars and tipsy motorcycles");

    Token token = JCasUtil.selectByIndex(myView, Token.class, 6);
    assertEquals("motorcycles", token.getCoveredText());
  }

  /**
   * Loads one fixture file with LF line endings and one with CR LF line endings and expects the
   * same four sentences from each. Before tokenizing, the raw bytes of each file are checked so
   * that the test fails clearly if the fixture no longer has the expected line endings.
   */
  @Test
  public void testNewlinesFromFile() throws Exception {
    File unixNewlines = new File("src/test/resources/data/docs/unix-newlines.txt.bin");
    assertEquals(55, unixNewlines.length());
    byte[] unixNewlinesBytes = readBytes(unixNewlines);
    assertEquals('.', unixNewlinesBytes[13]);
    assertEquals(0x0A, unixNewlinesBytes[14]);
    assertEquals('s', unixNewlinesBytes[15]);

    String text = FileUtil.loadTextFile(unixNewlines, "UTF-8");
    text = text.substring(1); // remove "\uFEFF" character from beginning of text
    tokenBuilder.buildTokens(jCas, text);
    assertSentences("sentence 1.", "sentence 2.", "sentence 3.", "sentence 4.");

    jCas.reset();
    File windowsNewlines = new File("src/test/resources/data/docs/windows-newlines.txt.bin");
    text = FileUtil.loadTextFile(windowsNewlines, "UTF-8");
    assertEquals(65, windowsNewlines.length());
    byte[] windowsNewlinesBytes = readBytes(windowsNewlines);
    assertEquals('.', windowsNewlinesBytes[13]);
    assertEquals(0x0D, windowsNewlinesBytes[14]);
    assertEquals(0x0A, windowsNewlinesBytes[15]);
    assertEquals('s', windowsNewlinesBytes[16]);

    text = text.substring(1); // remove "\uFEFF" character from beginning of text
    tokenBuilder.buildTokens(jCas, text);
    assertSentences("sentence 1.", "sentence 2.", "sentence 3.", "sentence 4.");
  }

  /**
   * Reads the complete content of a file. The stream is closed here because
   * {@code IOUtils.toByteArray} leaves closing to the caller.
   *
   * @param file
   *          the file to read
   * @return the bytes of the file
   * @throws Exception
   *           if the file cannot be opened or read
   */
  private static byte[] readBytes(File file) throws Exception {
    FileInputStream in = new FileInputStream(file);
    try {
      return IOUtils.toByteArray(in);
    } finally {
      in.close();
    }
  }

  /**
   * Asserts that the sentences currently in {@code jCas} have exactly the given covered texts, in
   * the given order.
   *
   * @param expectedTexts
   *          the expected covered text of each sentence
   */
  private void assertSentences(String... expectedTexts) {
    Collection<Sentence> sentences = JCasUtil.select(jCas, Sentence.class);
    assertEquals(expectedTexts.length, sentences.size());
    Iterator<Sentence> iterator = sentences.iterator();
    for (String expectedText : expectedTexts) {
      assertEquals(expectedText, iterator.next().getCoveredText());
    }
  }
}