blob: c1a7eb34f5bd9f6db077e1666aae262d46626672 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.nlp.model;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.stanbol.enhancer.contentitem.inmemory.InMemoryContentItemFactory;
import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum;
import org.apache.stanbol.enhancer.nlp.model.annotation.Annotation;
import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory;
import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
import org.apache.stanbol.enhancer.servicesapi.impl.StringSource;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Unit tests for {@link AnalysedText}: creation of Sentences, Chunks and
 * Tokens (with absolute and relative offsets), iteration over the span
 * hierarchy, span filtering and span annotations.
 * @author westei
 */
public class AnalysedTextTest {
private static Logger log = LoggerFactory.getLogger(AnalysedTextTest.class);
public static final String text = "The Stanbol enhancer can detect famous " +
"cities such as Paris and people such as Bob Marley. With " +
"disambiguation it would even be able to detect the Comedian " +
"Bob Marley trafeling to Paris in Texas.";
public static final Annotation<Number> testAnnotation =
new Annotation<Number>("test", Number.class);
/* -----
* Test data creates within the BeforeClass
* -----
*/
/**
* AnalysedText instance filled in {@link #setup()} with test dats
*/
private static AnalysedText analysedTextWithData;
private static LinkedHashMap<Sentence,String> expectedSentences = new LinkedHashMap<Sentence,String>();
private static LinkedHashMap<Chunk,String> expectedChunks = new LinkedHashMap<Chunk,String>();
private static LinkedHashMap<Token,String> expectedTokens = new LinkedHashMap<Token,String>();
/* -----
* Test data creates before every single test
* -----
*/
/**
* Empty AnalysedText instance created before each test
*/
private static AnalysedText at;
private static final ContentItemFactory ciFactory = InMemoryContentItemFactory.getInstance();
private static final AnalysedTextFactory atFactory = AnalysedTextFactory.getDefaultInstance();
private static ContentItem ci;
@BeforeClass
public static final void setup() throws IOException {
analysedTextWithData = createAnalysedText();
int sentence = text.indexOf('.')+1;
Sentence sent1 = analysedTextWithData.addSentence(0, sentence);
expectedSentences.put(sent1, "The Stanbol enhancer can detect famous " +
"cities such as Paris and people such as Bob Marley.");
Sentence sent2 = analysedTextWithData.addSentence(sentence+1, text.length());
expectedSentences.put(sent2, "With disambiguation it would even be able " +
"to detect the Comedian Bob Marley trafeling to Paris in Texas.");
Token the = sent1.addToken(0, 3);
expectedTokens.put(the, "The");
Token stanbol = sent1.addToken(4,11);
expectedTokens.put(stanbol, "Stanbol");
//use index to create Tokens
int enhancerStart = sent1.getSpan().toString().indexOf("enhancer");
Token enhancer = sent1.addToken(enhancerStart,enhancerStart+"enhancer".length());
expectedTokens.put(enhancer, "enhancer");
//create a chunk
Chunk stanbolEnhancer = analysedTextWithData.addChunk(stanbol.getStart(), enhancer.getEnd());
expectedChunks.put(stanbolEnhancer, "Stanbol enhancer");
int parisStart = sent1.getSpan().toString().indexOf("Paris");
Token paris = sent1.addToken(parisStart, parisStart+5);
expectedTokens.put(paris, "Paris");
int bobMarleyStart = sent1.getSpan().toString().indexOf("Bob Marley");
Chunk bobMarley = sent1.addChunk(bobMarleyStart, bobMarleyStart+10);
expectedChunks.put(bobMarley, "Bob Marley");
Token bob = bobMarley.addToken(0, 3);
expectedTokens.put(bob, "Bob");
Token marley = bobMarley.addToken(4, 10);
expectedTokens.put(marley, "Marley");
Token with = sent2.addToken(0, 4);
expectedTokens.put(with, "With");
Token disambiguation = sent2.addToken(5, 5+"disambiguation".length());
expectedTokens.put(disambiguation, "disambiguation");
int comedianBobMarleyIndex = sent2.getSpan().toString().indexOf("Comedian");
Chunk comedianBobMarley = sent2.addChunk(comedianBobMarleyIndex,
comedianBobMarleyIndex+"Comedian Bob Marley".length());
expectedChunks.put(comedianBobMarley, "Comedian Bob Marley");
Token comedian = comedianBobMarley.addToken(0, "Comedian".length());
expectedTokens.put(comedian, "Comedian");
Token bobSent2 = comedianBobMarley.addToken(9,9+"Bob".length());
expectedTokens.put(bobSent2, "Bob");
Token marleySent2 = comedianBobMarley.addToken(13, 13+"Marley".length());
expectedTokens.put(marleySent2, "Marley");
int parisIndex = sent2.getSpan().toString().indexOf("Paris");
Chunk parisInTexas = sent2.addChunk(parisIndex, parisIndex+"Paris in Texas".length());
expectedChunks.put(parisInTexas, "Paris in Texas");
Token parisSent2 = parisInTexas.addToken(0, "Paris".length());
expectedTokens.put(parisSent2, "Paris");
int inIndex = parisInTexas.getSpan().indexOf("in");
Token in = parisInTexas.addToken(inIndex,
inIndex+2);
expectedTokens.put(in, "in");
Token texasSent2 = parisInTexas.addToken(parisInTexas.getSpan().toString().indexOf("Texas"),
parisInTexas.getSpan().toString().indexOf("Texas")+"Texas".length());
expectedTokens.put(texasSent2, "Texas");
}
@Before
public void initAnalysedText() throws Exception {
at = createAnalysedText();
}
/**
* @throws IOException
*/
private static AnalysedText createAnalysedText() throws IOException {
ci = ciFactory.createContentItem(new StringSource(text));
Entry<UriRef,Blob> textBlob = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
return atFactory.createAnalysedText(ci, textBlob.getValue());
}
@Test
public void testSpanFilter(){
Iterator<Sentence> sentences = analysedTextWithData.getSentences();
Iterator<Chunk> chunks = analysedTextWithData.getChunks();
Iterator<Token> tokens = analysedTextWithData.getTokens();
for(Entry<Sentence,String> sentEntry : expectedSentences.entrySet()){
Sentence sent = sentences.next();
Assert.assertEquals(sentEntry.getKey(), sent);
Assert.assertEquals(sentEntry.getValue(), sent.getSpan().toString());
}
for(Entry<Chunk,String> chunkEntry : expectedChunks.entrySet()){
Chunk chunk = chunks.next();
Assert.assertEquals(chunkEntry.getKey(), chunk);
Assert.assertEquals(chunkEntry.getValue(), chunk.getSpan().toString());
}
for(Entry<Token,String> tokenEntry : expectedTokens.entrySet()){
Token token = tokens.next();
Assert.assertEquals(tokenEntry.getKey(), token);
Assert.assertEquals(tokenEntry.getValue(), token.getSpan().toString());
}
}
@Test
public void testAnalysedText(){
Assert.assertEquals(text, at.getText());
Assert.assertEquals(text, at.getSpan());
Assert.assertEquals(0, at.getStart());
Assert.assertEquals(text.length(), at.getEnd());
}
/**
* Spans created relative to an other MUST NOT exceed the span of the
* other one
*/
@Test(expected=IllegalArgumentException.class)
public void testExceedsRelativeSpan(){
Sentence sent = at.addSentence(0, 10);
sent.addChunk(5, 15); //Invalid
}
@Test(expected=IllegalArgumentException.class)
public void testNegativeStart(){
at.addSentence(-1, 10);
}
@Test(expected=IllegalArgumentException.class)
public void testRelativeNegativeStart(){
Sentence sent = at.addSentence(0, 10);
sent.addToken(-1, 5);
}
@Test
public void testAnalysedTextaddSpanMethods(){
Collection<Span> spans = new HashSet<Span>();
//add some span of different types
spans.add(at.addToken(4, 11));
spans.add(at.addChunk(4,19));
spans.add(at.addSentence(0, 91));
Set<Span> atSpans = AnalysedTextUtils.asSet(at.getEnclosed(EnumSet.allOf(SpanTypeEnum.class)));
Assert.assertTrue(spans.containsAll(atSpans));
Assert.assertTrue(atSpans.containsAll(spans));
}
/**
* Test relative additions (with relative indexes) as well as iterators
* over this hierarchy
*/
@Test
public void testSpanHierarchy(){
int[] startPos = new int[]{0,1,2};
int[] endPos = new int[]{1,2,3};
int maxVal = endPos[endPos.length-1];
int tokenLength = 5;
int chunkLength = tokenLength*maxVal;
int sentenceLength = tokenLength*maxVal*maxVal;
List<Sentence> sentences = new ArrayList<Sentence>(startPos.length);
List<Chunk> chunks = new ArrayList<Chunk>(startPos.length*2);
List<Token> tokens = new ArrayList<Token>(startPos.length*3);
int start;
int end;
//1. test relative add and absolute start/end
log.info("--- adding Spans ---");
for(int s=0;s<startPos.length;s++){
start = startPos[s]*sentenceLength;
end = endPos[s]*sentenceLength;
Sentence sent = at.addSentence(start, end);
log.info("add {}",sent);
Assert.assertEquals(start, sent.getStart());
Assert.assertEquals(end, sent.getEnd());
sentences.add(sent);
}
//1.b iterate over the sentences while adding Chunks and Tokens to
// test that returned Iterators MUST NOT throw
// ConcurrentModificationExceptions when adding Spans to the AnalysedText
Iterator<Sentence> sentenceIt = at.getSentences();
while(sentenceIt.hasNext()){
Sentence sent = sentenceIt.next();
for(int c=0;c<startPos.length;c++){
start = startPos[c]*chunkLength;
end = endPos[c]*chunkLength;
Chunk chunk = sent.addChunk(start, end);
log.info(" add {}",chunk);
start = sent.getStart() + start;
end = sent.getStart() + end;
Assert.assertEquals(start, chunk.getStart());
Assert.assertEquals(end, chunk.getEnd());
chunks.add(chunk);
for(int t=0;t<startPos.length;t++){
start = startPos[t]*tokenLength;
end = endPos[t]*tokenLength;
Token token = chunk.addToken(start, end);
log.info(" add {}",token);
start = chunk.getStart() + start;
end = chunk.getStart() + end;
Assert.assertEquals(start, token.getStart());
Assert.assertEquals(end, token.getEnd());
tokens.add(token);
}
}
}
//2. test iterations of enclosed
int chunksInSentence = startPos.length;
int tokensInChunk = chunksInSentence;
int tokensInSentence = chunksInSentence*tokensInChunk;
Iterator<Sentence> sentIt = at.getSentences();
int s = 0;
int c = 0;
int t = 0;
log.info("--- iterating over Spans ---");
log.info("{}",at);
for(;sentIt.hasNext();s++){
Assert.assertTrue(sentences.size()+" Sentences Expected (found: "+(s+1)+")",s < sentences.size());
Sentence sent = sentIt.next();
log.info(" {}",sent);
Assert.assertEquals(sentences.get(s), sent);
Iterator<Chunk> chunkIt = sent.getChunks();
int foundChunks = 0;
for(;chunkIt.hasNext();c++){
Assert.assertTrue(chunks.size()+" Chunks Expected (found: "+(c+1)+")",c < chunks.size());
Chunk chunk = chunkIt.next();
log.info(" {}",chunk);
Assert.assertEquals(chunks.get(c), chunk);
Iterator<Token> tokenIt = chunk.getTokens();
int foundTokens = 0;
for(;tokenIt.hasNext();t++){
Assert.assertTrue(tokens.size()+" Tokens Expected (found: "+(t+1)+")",t < tokens.size());
Token token = tokenIt.next();
log.info(" {}",token);
Assert.assertEquals(tokens.get(t), token);
foundTokens++;
}
Assert.assertEquals(tokensInChunk+" Tokens expected in Chunk", tokensInChunk,foundTokens);
foundChunks++;
}
Assert.assertEquals(chunksInSentence+" Chunks expected in Sentence", chunksInSentence,foundChunks);
//also iterate over tokens within a sentence
log.info(" {}",sent);
Iterator<Token> tokenIt = sent.getTokens();
int foundTokens = 0;
for(;tokenIt.hasNext();foundTokens++){
Token token = tokenIt.next();
log.info(" {}",token);
Assert.assertEquals(tokens.get(s*tokensInSentence+foundTokens), token);
}
Assert.assertEquals(tokensInSentence+" Tokens expected in Sentence", tokensInSentence,foundTokens);
}
Assert.assertEquals(sentences.size()+" Sentences Expected (found: "+s+")", sentences.size(),s);
Assert.assertEquals(chunks.size()+" Chunks Expected (found: "+c+")", chunks.size(),c);
Assert.assertEquals(tokens.size()+" Sentences Expected (found: "+t+")", tokens.size(),t);
//also iterate over Chunks in AnalysedText
Iterator<Chunk> chunkIt = at.getChunks();
int foundChunks = 0;
log.info("{}",at);
for(;chunkIt.hasNext();foundChunks++){
Chunk chunk = chunkIt.next();
log.info(" {}",chunk);
Assert.assertEquals(chunks.get(foundChunks), chunk);
}
Assert.assertEquals(chunks.size()+" Chunks expected in AnalysedText", chunks.size(),foundChunks);
//also iterate over Tokens in AnalysedText
Iterator<Token> tokenIt = at.getTokens();
int foundTokens = 0;
log.info("{}",at);
for(;tokenIt.hasNext();foundTokens++){
Token token = tokenIt.next();
log.info(" {}",token);
Assert.assertEquals(tokens.get(foundTokens), token);
}
Assert.assertEquals(tokens.size()+" Tokens expected in AnalysedText", tokens.size(),foundTokens);
//Finally iterate over multiple token types
Iterator<Span> sentencesAndChunks = at.getEnclosed(
EnumSet.of(SpanTypeEnum.Sentence,SpanTypeEnum.Chunk));
s=0;
c=0;
log.info("{} >> Iterate over Sentences and Chunks",at);
while(sentencesAndChunks.hasNext()){
Span span = sentencesAndChunks.next();
log.info(" {}",span);
if(span.getType() == SpanTypeEnum.Chunk){
Assert.assertEquals(chunks.get(c), span);
c++;
} else if(span.getType() == SpanTypeEnum.Sentence){
Assert.assertEquals(sentences.get(s), span);
s++;
} else {
Assert.fail("Unexpected SpanType '"+span.getType()+" (Span: "+span.getClass()+")");
}
}
Assert.assertEquals(sentences.size()+" Sentences expected in AnalysedText", sentences.size(),s);
Assert.assertEquals((sentences.size()*chunksInSentence)+" Chunks expected in AnalysedText",
(sentences.size()*chunksInSentence),c);
}
@Test
public void testAnnotation(){
List<Value<Number>> values = new ArrayList<Value<Number>>();
values.add(new Value<Number>(26,0.6));
values.add(new Value<Number>(27l));
values.add(new Value<Number>(28.0f));
values.add(new Value<Number>(25.0,0.8));
at.addAnnotations(testAnnotation, values);
Value<Number> value = at.getAnnotation(testAnnotation);
Assert.assertNotNull(value);
Assert.assertEquals(Double.valueOf(25.0), value.value());
Assert.assertEquals(0.8d, value.probability(), 0.0d);
Number prev = Float.valueOf(24f);
for(Value<Number> v : at.getAnnotations(testAnnotation)){
Assert.assertNotNull(v);
Assert.assertTrue(v.value().doubleValue() > prev.doubleValue());
prev = v.value();
}
//check that the order of Annotations without probability is kept
at.addAnnotation(testAnnotation, new Value<Number>(29));
prev = Integer.valueOf(24);
for(Value<Number> v : at.getAnnotations(testAnnotation)){
Assert.assertNotNull(v);
Assert.assertTrue(v.value().intValue() > prev.intValue());
prev = v.value();
}
}
}