/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.nlp.json;

import static org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses.DBPEDIA_ORGANISATION;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Collections;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Set;
import java.util.Map.Entry;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.stanbol.enhancer.contentitem.inmemory.InMemoryContentItemFactory;
import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
import org.apache.stanbol.enhancer.nlp.model.Chunk;
import org.apache.stanbol.enhancer.nlp.model.Sentence;
import org.apache.stanbol.enhancer.nlp.model.Span;
import org.apache.stanbol.enhancer.nlp.model.Token;
import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum;
import org.apache.stanbol.enhancer.nlp.model.annotation.Annotation;
import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
import org.apache.stanbol.enhancer.nlp.morpho.Case;
import org.apache.stanbol.enhancer.nlp.morpho.CaseTag;
import org.apache.stanbol.enhancer.nlp.morpho.Definitness;
import org.apache.stanbol.enhancer.nlp.morpho.Gender;
import org.apache.stanbol.enhancer.nlp.morpho.GenderTag;
import org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures;
import org.apache.stanbol.enhancer.nlp.morpho.NumberFeature;
import org.apache.stanbol.enhancer.nlp.morpho.NumberTag;
import org.apache.stanbol.enhancer.nlp.morpho.Person;
import org.apache.stanbol.enhancer.nlp.morpho.Tense;
import org.apache.stanbol.enhancer.nlp.morpho.TenseTag;
import org.apache.stanbol.enhancer.nlp.morpho.VerbMood;
import org.apache.stanbol.enhancer.nlp.morpho.VerbMoodTag;
import org.apache.stanbol.enhancer.nlp.ner.NerTag;
import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
import org.apache.stanbol.enhancer.nlp.pos.Pos;
import org.apache.stanbol.enhancer.nlp.pos.PosTag;
import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory;
import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
import org.apache.stanbol.enhancer.servicesapi.impl.StringSource;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class AnalyzedTextSerializerAndParserTest {

private static final Logger log = LoggerFactory.getLogger(AnalyzedTextSerializerAndParserTest.class);
public static final String text = "The Stanbol enhancer can detect famous " +
"cities such as Paris and people such as Bob Marley.";
public static final Annotation<Number> testAnnotation =
new Annotation<Number>("test", Number.class);
/* -----
* Test data created within the @BeforeClass method
* -----
*/
/**
* AnalysedText instance filled in {@link #setup()} with test data
*/
private static AnalysedText analysedTextWithData;
private static LinkedHashMap<Sentence,String> expectedSentences = new LinkedHashMap<Sentence,String>();
private static LinkedHashMap<Chunk,String> expectedChunks = new LinkedHashMap<Chunk,String>();
private static LinkedHashMap<Token,String> expectedTokens = new LinkedHashMap<Token,String>();
/* -----
* Test data created before each test
* -----
*/
/**
* Empty AnalysedText instance created before each test
*/
private static AnalysedText at;
private static final ContentItemFactory ciFactory = InMemoryContentItemFactory.getInstance();
private static final AnalysedTextFactory atFactory = AnalysedTextFactory.getDefaultInstance();
private static ContentItem ci;
private static Entry<UriRef,Blob> textBlob;
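
/**
* Initialises the shared test data: creates a ContentItem with a plain text
* Blob for {@link #text} and fills {@link #analysedTextWithData} with a
* Sentence, several Tokens and a Chunk together with their expected span texts.
*/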
@BeforeClass
public static final void setup() throws IOException {
ci = ciFactory.createContentItem(new StringSource(text));
textBlob = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
analysedTextWithData = createAnalysedText();
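//the first sentence ends right after the first '.'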
int sentenceEnd = text.indexOf('.') + 1;
Sentence sent1 = analysedTextWithData.addSentence(0, sentenceEnd);
expectedSentences.put(sent1, "The Stanbol enhancer can detect famous " +
"cities such as Paris and people such as Bob Marley.");
Token the = sent1.addToken(0, 3);
expectedTokens.put(the, "The");
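//annotate the Tokens with POS (and sentiment) values; the tags are synthetic test data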
the.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(
new PosTag("PREP",Pos.Preposition), 0.85));
Token stanbol = sent1.addToken(4,11);
expectedTokens.put(stanbol, "Stanbol");
stanbol.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(
new PosTag("PN", Pos.ProperNoun),0.95));
stanbol.addAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION, Value.value(
0.5));
//use index to create Tokens
int enhancerStart = sent1.getSpan().toString().indexOf("enhancer");
Token enhancer = sent1.addToken(enhancerStart,enhancerStart+"enhancer".length());
expectedTokens.put(enhancer, "enhancer");
enhancer.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(
new PosTag("PN", Pos.ProperNoun),0.95));
enhancer.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(
new PosTag("N", LexicalCategory.Noun),0.87));
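//build a MorphoFeatures annotation with the lemma "enhance" and values for the different feature types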
MorphoFeatures morpho = new MorphoFeatures("enhance");
morpho.addCase(new CaseTag("test-case-1",Case.Comitative));
morpho.addCase(new CaseTag("test-case-2",Case.Abessive));
morpho.addDefinitness(Definitness.Definite);
morpho.addPerson(Person.First);
morpho.addPos(new PosTag("PN", Pos.ProperNoun));
morpho.addGender(new GenderTag("test-gender", Gender.Masculine));
morpho.addNumber(new NumberTag("test-number", NumberFeature.Plural));
morpho.addTense(new TenseTag("test-tense", Tense.Present));
morpho.addVerbForm(new VerbMoodTag("test-verb-mood", VerbMood.ConditionalVerb));
enhancer.addAnnotation(NlpAnnotations.MORPHO_ANNOTATION, Value.value(morpho));
//create a chunk
Chunk stanbolEnhancer = analysedTextWithData.addChunk(stanbol.getStart(), enhancer.getEnd());
expectedChunks.put(stanbolEnhancer, "Stanbol enhancer");
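//annotate the Chunk with a NER and a Phrase annotation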
stanbolEnhancer.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(
new NerTag("organization", DBPEDIA_ORGANISATION)));
stanbolEnhancer.addAnnotation(NlpAnnotations.PHRASE_ANNOTATION, Value.value(
new PhraseTag("NP", LexicalCategory.Noun),0.98));
}
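
/**
* Creates a fresh, empty AnalysedText before each test.
*/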
@Before
public void initAnalysedText() throws Exception {
at = createAnalysedText();
}

/**
* Creates an empty {@link AnalysedText} instance for the test {@link Blob}.
* @throws IOException on any error while reading the Blob's data
*/
private static AnalysedText createAnalysedText() throws IOException {
return atFactory.createAnalysedText(textBlob.getValue());
}
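
/**
* Serializes the test AnalysedText to JSON, checks the result for some
* expected elements and parses it back to an AnalysedText that must be
* equal to the original one.
*/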
@Test
public void testSerialization() throws IOException {
ByteArrayOutputStream bout = new ByteArrayOutputStream();
AnalyzedTextSerializer serializer = AnalyzedTextSerializer.getDefaultInstance();
serializer.serialize(analysedTextWithData, bout, null);
//get the serialized String and check for some expected elements
byte[] data = bout.toByteArray();
String serialized = new String(data,Charset.forName("UTF-8"));
log.info(serialized);
Assert.assertTrue(serialized.contains("\"spans\" : [ {"));
Assert.assertTrue(serialized.contains("\"type\" : \"Text\""));
Assert.assertTrue(serialized.contains("\"type\" : \"Sentence\""));
Assert.assertTrue(serialized.contains("\"type\" : \"Token\""));
Assert.assertTrue(serialized.contains("\"stanbol.enhancer.nlp.pos\" : {"));
Assert.assertTrue(serialized.contains("\"class\" : \"org.apache.stanbol.enhancer.nlp.pos.PosTag\""));
Assert.assertTrue(serialized.contains("\"stanbol.enhancer.nlp.ner\" : {"));
Assert.assertTrue(serialized.contains("\"class\" : \"org.apache.stanbol.enhancer.nlp.ner.NerTag\""));
Assert.assertTrue(serialized.contains("\"stanbol.enhancer.nlp.morpho\" : {"));
Assert.assertTrue(serialized.contains("\"class\" : \"org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures\""));
//deserialize
AnalyzedTextParser parser = AnalyzedTextParser.getDefaultInstance();
AnalysedText parsedAt = parser.parse(new ByteArrayInputStream(data), null,
atFactory.createAnalysedText(textBlob.getValue()));
Assert.assertEquals(analysedTextWithData, parsedAt);
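//compare all Spans and their annotations pair-wise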
Iterator<Span> origSpanIt = analysedTextWithData.getEnclosed(EnumSet.allOf(SpanTypeEnum.class));
Iterator<Span> parsedSpanIt = parsedAt.getEnclosed(EnumSet.allOf(SpanTypeEnum.class));
while(origSpanIt.hasNext() && parsedSpanIt.hasNext()){
Span orig = origSpanIt.next();
Span parsed = parsedSpanIt.next();
Assert.assertEquals(orig, parsed);
Set<String> origKeys = orig.getKeys();
Set<String> parsedKeys = parsed.getKeys();
Assert.assertEquals(origKeys, parsedKeys);
for(String key : origKeys){
List<Value<?>> origValues = orig.getValues(key);
List<Value<?>> parsedValues = parsed.getValues(key);
Assert.assertEquals(origValues, parsedValues);
}
}
Assert.assertFalse("Original AnalyzedText MUST NOT have additional Spans",origSpanIt.hasNext());
Assert.assertFalse("Parsed AnalyzedText MUST NOT have additional Spans",parsedSpanIt.hasNext());
}
}