/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.nlp.json;

import static org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses.DBPEDIA_ORGANISATION;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Collections;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Set;
import java.util.Map.Entry;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.stanbol.enhancer.contentitem.inmemory.InMemoryContentItemFactory;
import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
import org.apache.stanbol.enhancer.nlp.model.Chunk;
import org.apache.stanbol.enhancer.nlp.model.Sentence;
import org.apache.stanbol.enhancer.nlp.model.Span;
import org.apache.stanbol.enhancer.nlp.model.Token;
import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum;
import org.apache.stanbol.enhancer.nlp.model.annotation.Annotation;
import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
import org.apache.stanbol.enhancer.nlp.morpho.Case;
import org.apache.stanbol.enhancer.nlp.morpho.CaseTag;
import org.apache.stanbol.enhancer.nlp.morpho.Definitness;
import org.apache.stanbol.enhancer.nlp.morpho.Gender;
import org.apache.stanbol.enhancer.nlp.morpho.GenderTag;
import org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures;
import org.apache.stanbol.enhancer.nlp.morpho.NumberFeature;
import org.apache.stanbol.enhancer.nlp.morpho.NumberTag;
import org.apache.stanbol.enhancer.nlp.morpho.Person;
import org.apache.stanbol.enhancer.nlp.morpho.Tense;
import org.apache.stanbol.enhancer.nlp.morpho.TenseTag;
import org.apache.stanbol.enhancer.nlp.morpho.VerbMood;
import org.apache.stanbol.enhancer.nlp.morpho.VerbMoodTag;
import org.apache.stanbol.enhancer.nlp.ner.NerTag;
import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
import org.apache.stanbol.enhancer.nlp.pos.Pos;
import org.apache.stanbol.enhancer.nlp.pos.PosTag;
import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory;
import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
import org.apache.stanbol.enhancer.servicesapi.impl.StringSource;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class AnalyzedTextSerializerAndParserTest {

private static final Logger log = LoggerFactory.getLogger(AnalyzedTextSerializerAndParserTest.class);
public static final String text = "The Stanbol enhancer can detect famous " +
"cities such as Paris and people such as Bob Marley.";
public static final Annotation<Number> testAnnotation =
new Annotation<Number>("test", Number.class);
/* -----
* Test data created within the @BeforeClass method
* -----
*/
/**
* AnalysedText instance filled in {@link #setup()} with test data
*/
private static AnalysedText analysedTextWithData;
private static LinkedHashMap<Sentence,String> expectedSentences = new LinkedHashMap<Sentence,String>();
private static LinkedHashMap<Chunk,String> expectedChunks = new LinkedHashMap<Chunk,String>();
private static LinkedHashMap<Token,String> expectedTokens = new LinkedHashMap<Token,String>();
/* -----
* Test data created before each test
* -----
*/
/**
* Empty AnalysedText instance created before each test
*/
private static AnalysedText at;
private static final ContentItemFactory ciFactory = InMemoryContentItemFactory.getInstance();
private static final AnalysedTextFactory atFactory = AnalysedTextFactory.getDefaultInstance();
private static ContentItem ci;
private static Entry<UriRef,Blob> textBlob;
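
/**
* Initialises the shared test data: creates a ContentItem with a plain text
* Blob for {@link #text} and fills {@link #analysedTextWithData} with a
* Sentence, several Tokens and a Chunk together with their expected span texts.
*/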
@BeforeClass
public static final void setup() throws IOException {
ci = ciFactory.createContentItem(new StringSource(text));
textBlob = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
analysedTextWithData = createAnalysedText();
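//the first sentence ends right after the first '.'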
int sentenceEnd = text.indexOf('.') + 1;
Sentence sent1 = analysedTextWithData.addSentence(0, sentenceEnd);
expectedSentences.put(sent1, "The Stanbol enhancer can detect famous " +
"cities such as Paris and people such as Bob Marley.");
Token the = sent1.addToken(0, 3);
expectedTokens.put(the, "The");
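//annotate the Tokens with POS (and sentiment) values; the tags are synthetic test data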
the.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(
new PosTag("PREP",Pos.Preposition), 0.85));
Token stanbol = sent1.addToken(4,11);
expectedTokens.put(stanbol, "Stanbol");
stanbol.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(
new PosTag("PN", Pos.ProperNoun),0.95));
stanbol.addAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION, Value.value(
0.5));
//use index to create Tokens
int enhancerStart = sent1.getSpan().toString().indexOf("enhancer");
Token enhancer = sent1.addToken(enhancerStart,enhancerStart+"enhancer".length());
expectedTokens.put(enhancer, "enhancer");
enhancer.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(
new PosTag("PN", Pos.ProperNoun),0.95));
enhancer.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(
new PosTag("N", LexicalCategory.Noun),0.87));
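//build a MorphoFeatures annotation with the lemma "enhance" and values for the different feature types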
MorphoFeatures morpho = new MorphoFeatures("enhance");
morpho.addCase(new CaseTag("test-case-1",Case.Comitative));
morpho.addCase(new CaseTag("test-case-2",Case.Abessive));
morpho.addDefinitness(Definitness.Definite);
morpho.addPerson(Person.First);
morpho.addPos(new PosTag("PN", Pos.ProperNoun));
morpho.addGender(new GenderTag("test-gender", Gender.Masculine));
morpho.addNumber(new NumberTag("test-number", NumberFeature.Plural));
morpho.addTense(new TenseTag("test-tense", Tense.Present));
morpho.addVerbForm(new VerbMoodTag("test-verb-mood", VerbMood.ConditionalVerb));
enhancer.addAnnotation(NlpAnnotations.MORPHO_ANNOTATION, Value.value(morpho));
//create a chunk
Chunk stanbolEnhancer = analysedTextWithData.addChunk(stanbol.getStart(), enhancer.getEnd());
expectedChunks.put(stanbolEnhancer, "Stanbol enhancer");
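//annotate the Chunk with a NER and a Phrase annotation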
stanbolEnhancer.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(
new NerTag("organization", DBPEDIA_ORGANISATION)));
stanbolEnhancer.addAnnotation(NlpAnnotations.PHRASE_ANNOTATION, Value.value(
new PhraseTag("NP", LexicalCategory.Noun),0.98));
}
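
/**
* Creates a fresh, empty AnalysedText before each test.
*/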
@Before
public void initAnalysedText() throws Exception {
at = createAnalysedText();
}

/**
* Creates an empty {@link AnalysedText} instance for the test {@link Blob}.
* @throws IOException on any error while reading the Blob's data
*/
private static AnalysedText createAnalysedText() throws IOException {
return atFactory.createAnalysedText(textBlob.getValue());
}
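
/**
* Serializes the test AnalysedText to JSON, checks the result for some
* expected elements and parses it back to an AnalysedText that must be
* equal to the original one.
*/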
@Test
public void testSerialization() throws IOException {
ByteArrayOutputStream bout = new ByteArrayOutputStream();
AnalyzedTextSerializer serializer = AnalyzedTextSerializer.getDefaultInstance();
serializer.serialize(analysedTextWithData, bout, null);
//get the serialized String and check for some expected elements
byte[] data = bout.toByteArray();
String serialized = new String(data,Charset.forName("UTF-8"));
log.info(serialized);
Assert.assertTrue(serialized.contains("\"spans\" : [ {"));
Assert.assertTrue(serialized.contains("\"type\" : \"Text\""));
Assert.assertTrue(serialized.contains("\"type\" : \"Sentence\""));
Assert.assertTrue(serialized.contains("\"type\" : \"Token\""));
Assert.assertTrue(serialized.contains("\"stanbol.enhancer.nlp.pos\" : {"));
Assert.assertTrue(serialized.contains("\"class\" : \"org.apache.stanbol.enhancer.nlp.pos.PosTag\""));
Assert.assertTrue(serialized.contains("\"stanbol.enhancer.nlp.ner\" : {"));
Assert.assertTrue(serialized.contains("\"class\" : \"org.apache.stanbol.enhancer.nlp.ner.NerTag\""));
Assert.assertTrue(serialized.contains("\"stanbol.enhancer.nlp.morpho\" : {"));
Assert.assertTrue(serialized.contains("\"class\" : \"org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures\""));
//deserialize
AnalyzedTextParser parser = AnalyzedTextParser.getDefaultInstance();
AnalysedText parsedAt = parser.parse(new ByteArrayInputStream(data), null,
atFactory.createAnalysedText(textBlob.getValue()));
Assert.assertEquals(analysedTextWithData, parsedAt);
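//compare all Spans and their annotations pair-wise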
Iterator<Span> origSpanIt = analysedTextWithData.getEnclosed(EnumSet.allOf(SpanTypeEnum.class));
Iterator<Span> parsedSpanIt = parsedAt.getEnclosed(EnumSet.allOf(SpanTypeEnum.class));
while(origSpanIt.hasNext() && parsedSpanIt.hasNext()){
Span orig = origSpanIt.next();
Span parsed = parsedSpanIt.next();
Assert.assertEquals(orig, parsed);
Set<String> origKeys = orig.getKeys();
Set<String> parsedKeys = parsed.getKeys();
Assert.assertEquals(origKeys, parsedKeys);
for(String key : origKeys){
List<Value<?>> origValues = orig.getValues(key);
List<Value<?>> parsedValues = parsed.getValues(key);
Assert.assertEquals(origValues, parsedValues);
}
}
Assert.assertFalse("Original AnalyzedText MUST NOT have additional Spans",origSpanIt.hasNext());
Assert.assertFalse("Parsed AnalyzedText MUST NOT have additional Spans",parsedSpanIt.hasNext());
}
}