blob: 094d4dfdc403c42c879179290c42dcb3606157e3 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.nlp.json;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import javax.swing.JPanel;
import org.apache.commons.io.IOUtils;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.ConfigurationPolicy;
import org.apache.felix.scr.annotations.Reference;
import org.apache.felix.scr.annotations.Service;
import org.apache.stanbol.enhancer.nlp.json.valuetype.ValueTypeParser;
import org.apache.stanbol.enhancer.nlp.json.valuetype.ValueTypeParserRegistry;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
import org.apache.stanbol.enhancer.nlp.model.Span;
import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum;
import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.codehaus.jackson.JsonFactory;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.JsonParseException;
import org.codehaus.jackson.JsonParser;
import org.codehaus.jackson.JsonToken;
import org.codehaus.jackson.io.SerializedString;
import org.codehaus.jackson.map.JsonMappingException;
import org.codehaus.jackson.map.ObjectMapper;
import org.codehaus.jackson.node.ArrayNode;
import org.codehaus.jackson.node.ObjectNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@Component(immediate=true,policy=ConfigurationPolicy.IGNORE)
@Service(value=AnalyzedTextParser.class)
public class AnalyzedTextParser {
private final Logger log = LoggerFactory.getLogger(AnalyzedTextParser.class);
private final static Charset UTF8 = Charset.forName("UTF-8");
private static AnalyzedTextParser defaultInstance;
protected ObjectMapper mapper = new ObjectMapper();
/**
* Can be used when running outside of OSGI to obtain the default (singleton)
* instance.
* @return
*/
public static final AnalyzedTextParser getDefaultInstance(){
if(defaultInstance == null){
defaultInstance = new AnalyzedTextParser(
ValueTypeParserRegistry.getInstance());
}
return defaultInstance;
}
/**
* Default constructor used by OSGI
*/
public AnalyzedTextParser() {}
/**
* Constructs a new Parser instance for the parsed {@link ValueTypeParserRegistry}
* instance. Typically this constructor should not be used as usages within
* an OSGI environment MUST lookup the service via the service registry.
* Usages outside an OSGI environment should prefer to use the
* {@link #getDefaultInstance()} instance to obtain the singleton instance.
* @param vtsr
*/
public AnalyzedTextParser(ValueTypeParserRegistry vtpr){
if(vtpr == null){
throw new IllegalArgumentException("The parsed ValueTypeParserRegistry MUST NOT be NULL!");
}
this.valueTypeParserRegistry = vtpr;
}
@Reference
protected ValueTypeParserRegistry valueTypeParserRegistry;
/**
* Parses {@link AnalysedText} {@link Span}s including annotations from the
* {@link InputStream}. The {@link AnalysedText} instance that is going to
* be enrichted with the parsed data needs to be parsed. In the simplest case
* the caller can create an empty instance by using a
* {@link AnalysedTextFactory}.
* @param in The stream to read the data from
* @param charset the {@link Charset} used by the stream
* @param at The {@link AnalysedText} instance used to add the data to
* @return the parsed {@link AnalysedText} instance enrichted with the
* information parsed from the Stream
* @throws IOException on any Error while reading or parsing the data
* from the Stream
*/
public AnalysedText parse(InputStream in, Charset charset, final AnalysedText at) throws IOException {
if(in == null){
throw new IllegalArgumentException("The parsed InputStream MUST NOT be NULL!");
}
if(charset == null){
charset = UTF8;
}
JsonParser parser = mapper.getJsonFactory().createJsonParser(new InputStreamReader(in, charset));
if(parser.nextToken() != JsonToken.START_OBJECT) { //start object
throw new IOException("JSON serialized AnalyzedTexts MUST use a JSON Object as Root!");
}
if(!parser.nextFieldName(new SerializedString("spans"))){
throw new IOException("JSON serialized AnalyzedText MUST define the 'spans' field as first entry "
+ "in the root JSON object!");
}
if(parser.nextValue() != JsonToken.START_ARRAY){
throw new IOException("The value of the 'span' field MUST BE an Json Array!");
}
boolean first = true;
while(parser.nextValue() == JsonToken.START_OBJECT){
if(first){
parseAnalyzedTextSpan(parser.readValueAsTree(), at);
first = false;
} else {
parseSpan(at, parser.readValueAsTree());
}
}
return at;
}
private void parseAnalyzedTextSpan(JsonNode node, AnalysedText at) throws IOException {
if(node.isObject()){
ObjectNode jSpan = (ObjectNode)node;
int[] spanPos = new int[]{-1,-1};
Collection<Entry<String,JsonNode>> jAnnotations = new ArrayList<Entry<String,JsonNode>>(4);
SpanTypeEnum spanType = parseSpanData(jSpan, spanPos, jAnnotations);
if(spanType != SpanTypeEnum.Text || spanPos[0] != 0 || spanPos[1] < 0){
throw new IOException("The AnalyzedText span MUST have the SpanType 'text', a "
+ "start position of '0' and an end position (ignored, json: "+jSpan);
}
if(at.getEnd() != spanPos[1]){
throw new IOException("The size of the local text '"+at.getEnd()+"' does not "
+ "match the span of the parsed AnalyzedText ["+spanPos[0]+","+spanPos[1]+"]!");
}
parseAnnotations(at, jAnnotations);
} else {
throw new IOException("Unable to parse AnalyzedText span form JsonNode "+node+" (expected JSON object)!");
}
}
private void parseSpan(AnalysedText at, JsonNode node) throws IOException {
if(node.isObject()){
ObjectNode jSpan = (ObjectNode)node;
int[] spanPos = new int[]{-1,-1};
Collection<Entry<String,JsonNode>> jAnnotations = new ArrayList<Entry<String,JsonNode>>(4);
SpanTypeEnum spanType = parseSpanData(jSpan, spanPos, jAnnotations);
if(spanType == null || spanPos[0] < 0 || spanPos[1] < 0){
log.warn("Illegal or missing span type, start and/or end position (ignored, json: "+jSpan);
return;
}
//now create the Span
Span span;
switch (spanType) {
case Text:
log.warn("Encounterd 'Text' span that is not the first span in the "
+ "'spans' array (ignored, json: "+node+")");
return;
case TextSection:
log.warn("Encountered 'TextSection' span. This SpanTypeEnum entry "
+ "is currently unused. If this is no longer the case please "
+ "update this implementation (ignored, json: "+node+")");
return;
case Sentence:
span = at.addSentence(spanPos[0], spanPos[1]);
break;
case Chunk:
span = at.addChunk(spanPos[0], spanPos[1]);
break;
case Token:
span = at.addToken(spanPos[0], spanPos[1]);
break;
default:
log.warn("Unsupported SpanTypeEnum '"+spanType+"'!. Please "
+ "update this implementation (ignored, json: "+node+")");
return;
}
if(!jAnnotations.isEmpty()){
parseAnnotations(span,jAnnotations);
}
} else {
log.warn("Unable to parse Span form JsonNode "+node+" (expected JSON object)!");
}
}
/**
* @param jSpan
* @param spanPos
* @param jAnnotations
* @return the type of the parsed span
*/
private SpanTypeEnum parseSpanData(ObjectNode jSpan, int[] spanPos,
Collection<Entry<String,JsonNode>> jAnnotations) {
SpanTypeEnum spanType = null;
for(Iterator<Entry<String,JsonNode>> fields = jSpan.getFields(); fields.hasNext();){
Entry<String,JsonNode> field = fields.next();
if("type".equals(field.getKey())){
if(field.getValue().isTextual()){
spanType = SpanTypeEnum.valueOf(field.getValue().getTextValue());
} else if(field.getValue().isInt()){
spanType = SpanTypeEnum.values()[field.getValue().getIntValue()];
} else {
log.warn("Unable to parse SpanType form JSON field "+field +" (ignored, json: "+jSpan+")");
return null;
}
} else if("start".equals(field.getKey())){
if(field.getValue().isInt()){
spanPos[0] = field.getValue().getIntValue();
} else {
log.warn("Unable to parse span start position form JSON field "
+field +" (ignored, json: "+jSpan+")");
return null;
}
} else if("end".equals(field.getKey())){
if(field.getValue().isInt()){
spanPos[1] = field.getValue().getIntValue();
} else {
log.warn("Unable to parse span end position form JSON field "
+field +" (ignored, json: "+jSpan+")");
return null;
}
} else {
jAnnotations.add(field);
}
}
if(spanType == null){
log.warn("Missing required field 'type' defining the type of the Span!");
}
return spanType;
}
private void parseAnnotations(Span span, Collection<Entry<String,JsonNode>> jAnnotations) throws IOException {
for(Entry<String,JsonNode> jAnnotation : jAnnotations){
if(jAnnotation.getValue().isObject()){
parseAnnotation(span, jAnnotation.getKey(), (ObjectNode)jAnnotation.getValue());
} else if(jAnnotation.getValue().isArray()){
ArrayNode jValues = (ArrayNode)jAnnotation.getValue();
for(int i=0;i< jValues.size();i++){
JsonNode jValue = jValues.get(i);
if(jValue.isObject()){
parseAnnotation(span, jAnnotation.getKey(), (ObjectNode)jValue);
} else {
log.warn("unable to parse the {} value of the annotation {} "
+ "because value is no JSON object (ignored, json: {}",
new Object[]{i,jAnnotation.getKey(),jAnnotation.getValue()});
}
}
} else {
log.warn("unable to parse Annotation {} because value is no JSON object (ignored, json: {}",
jAnnotation.getKey(),jAnnotation.getValue());
}
}
}
private void parseAnnotation(Span span, String key, ObjectNode jValue) throws IOException {
JsonNode jClass = jValue.path("class");
if(!jClass.isTextual()){
log.warn("unable to parse Annotation {} because 'class' field "
+ "is not set or not a stringis no JSON object (ignored, json: {}",
key,jValue);
return;
}
Class<?> clazz;
try {
clazz = AnalyzedTextParser.class.getClassLoader().loadClass(jClass.getTextValue());
} catch (ClassNotFoundException e) {
log.warn("Unable to parse Annotation "+key
+ " because the 'class' "+jClass.getTextValue()+" of the "
+ "the value can not be resolved (ignored, json: "+jValue+")",e);
return;
}
ValueTypeParser<?> parser = this.valueTypeParserRegistry.getParser(clazz);
Object value;
if(parser != null){
value = parser.parse(jValue);
} else {
JsonNode valueNode = jValue.path("value");
if(valueNode.isMissingNode()){
log.warn("unable to parse value for annotation {} because the "
+ "field 'value' is not present (ignored, json: {}",
key,jValue);
return;
} else {
try {
value = mapper.treeToValue(valueNode, clazz);
} catch (JsonParseException e) {
log.warn("unable to parse value for annotation "
+ key+ "because the value can"
+ "not be converted to the class "+ clazz.getName()
+ "(ignored, json: "+jValue+")",e);
return;
} catch (JsonMappingException e) {
log.warn("unable to parse value for annotation "
+ key+ "because the value can"
+ "not be converted to the class "+ clazz.getName()
+ "(ignored, json: "+jValue+")",e);
return;
}
}
}
JsonNode jProb = jValue.path("prob");
if(!jProb.isDouble()){
span.addValue(key, Value.value(value));
} else {
span.addValue(key, Value.value(value,jProb.getDoubleValue()));
}
}
/**
* Parses the SpanType for the parsed {@link ObjectNode} representing a {@link Span}
* @param jSpan the JSON root node of the span
* @return the type or <code>null</code> if the information is missing
*/
private SpanTypeEnum parseSpanType(ObjectNode jSpan) {
EnumSet<SpanTypeEnum> spanTypes = JsonUtils.parseEnum(jSpan, "type", SpanTypeEnum.class);
if(spanTypes.isEmpty()){
log.warn("Unable to parse Span with missing 'type' (json: "+jSpan+")!");
return null;
}
if(spanTypes.size() > 1){
log.warn("Found Span with multiple 'types' (Json:"+jSpan+")!");
}
return spanTypes.iterator().next();
}
}