| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.stanbol.enhancer.nlp.json; |
| |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.InputStreamReader; |
| import java.nio.charset.Charset; |
| import java.util.ArrayList; |
| import java.util.Collection; |
| import java.util.EnumSet; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Map.Entry; |
| |
| import javax.swing.JPanel; |
| |
| import org.apache.commons.io.IOUtils; |
| import org.apache.felix.scr.annotations.Component; |
| import org.apache.felix.scr.annotations.ConfigurationPolicy; |
| import org.apache.felix.scr.annotations.Reference; |
| import org.apache.felix.scr.annotations.Service; |
| import org.apache.stanbol.enhancer.nlp.json.valuetype.ValueTypeParser; |
| import org.apache.stanbol.enhancer.nlp.json.valuetype.ValueTypeParserRegistry; |
| import org.apache.stanbol.enhancer.nlp.model.AnalysedText; |
| import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory; |
| import org.apache.stanbol.enhancer.nlp.model.Span; |
| import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum; |
| import org.apache.stanbol.enhancer.nlp.model.annotation.Value; |
| import org.apache.stanbol.enhancer.servicesapi.Blob; |
| import org.codehaus.jackson.JsonFactory; |
| import org.codehaus.jackson.JsonNode; |
| import org.codehaus.jackson.JsonParseException; |
| import org.codehaus.jackson.JsonParser; |
| import org.codehaus.jackson.JsonToken; |
| import org.codehaus.jackson.io.SerializedString; |
| import org.codehaus.jackson.map.JsonMappingException; |
| import org.codehaus.jackson.map.ObjectMapper; |
| import org.codehaus.jackson.node.ArrayNode; |
| import org.codehaus.jackson.node.ObjectNode; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| @Component(immediate=true,policy=ConfigurationPolicy.IGNORE) |
| @Service(value=AnalyzedTextParser.class) |
| public class AnalyzedTextParser { |
| |
| private final Logger log = LoggerFactory.getLogger(AnalyzedTextParser.class); |
| |
| private final static Charset UTF8 = Charset.forName("UTF-8"); |
| |
| private static AnalyzedTextParser defaultInstance; |
| |
| protected ObjectMapper mapper = new ObjectMapper(); |
| /** |
| * Can be used when running outside of OSGI to obtain the default (singleton) |
| * instance. |
| * @return |
| */ |
| public static final AnalyzedTextParser getDefaultInstance(){ |
| if(defaultInstance == null){ |
| defaultInstance = new AnalyzedTextParser( |
| ValueTypeParserRegistry.getInstance()); |
| } |
| return defaultInstance; |
| } |
| |
/**
 * Default constructor used by OSGI. It does not assign the
 * {@link ValueTypeParserRegistry}.
 */
public AnalyzedTextParser() {
    super();
}
| |
| /** |
| * Constructs a new Parser instance for the parsed {@link ValueTypeParserRegistry} |
| * instance. Typically this constructor should not be used as usages within |
| * an OSGI environment MUST lookup the service via the service registry. |
| * Usages outside an OSGI environment should prefer to use the |
| * {@link #getDefaultInstance()} instance to obtain the singleton instance. |
| * @param vtsr |
| */ |
| public AnalyzedTextParser(ValueTypeParserRegistry vtpr){ |
| if(vtpr == null){ |
| throw new IllegalArgumentException("The parsed ValueTypeParserRegistry MUST NOT be NULL!"); |
| } |
| this.valueTypeParserRegistry = vtpr; |
| } |
| |
| @Reference |
| protected ValueTypeParserRegistry valueTypeParserRegistry; |
| |
| /** |
| * Parses {@link AnalysedText} {@link Span}s including annotations from the |
| * {@link InputStream}. The {@link AnalysedText} instance that is going to |
| * be enrichted with the parsed data needs to be parsed. In the simplest case |
| * the caller can create an empty instance by using a |
| * {@link AnalysedTextFactory}. |
| * @param in The stream to read the data from |
| * @param charset the {@link Charset} used by the stream |
| * @param at The {@link AnalysedText} instance used to add the data to |
| * @return the parsed {@link AnalysedText} instance enrichted with the |
| * information parsed from the Stream |
| * @throws IOException on any Error while reading or parsing the data |
| * from the Stream |
| */ |
| public AnalysedText parse(InputStream in, Charset charset, final AnalysedText at) throws IOException { |
| if(in == null){ |
| throw new IllegalArgumentException("The parsed InputStream MUST NOT be NULL!"); |
| } |
| if(charset == null){ |
| charset = UTF8; |
| } |
| JsonParser parser = mapper.getJsonFactory().createJsonParser(new InputStreamReader(in, charset)); |
| if(parser.nextToken() != JsonToken.START_OBJECT) { //start object |
| throw new IOException("JSON serialized AnalyzedTexts MUST use a JSON Object as Root!"); |
| } |
| if(!parser.nextFieldName(new SerializedString("spans"))){ |
| throw new IOException("JSON serialized AnalyzedText MUST define the 'spans' field as first entry " |
| + "in the root JSON object!"); |
| } |
| if(parser.nextValue() != JsonToken.START_ARRAY){ |
| throw new IOException("The value of the 'span' field MUST BE an Json Array!"); |
| } |
| boolean first = true; |
| while(parser.nextValue() == JsonToken.START_OBJECT){ |
| if(first){ |
| parseAnalyzedTextSpan(parser.readValueAsTree(), at); |
| first = false; |
| } else { |
| parseSpan(at, parser.readValueAsTree()); |
| } |
| } |
| return at; |
| } |
| |
| private void parseAnalyzedTextSpan(JsonNode node, AnalysedText at) throws IOException { |
| if(node.isObject()){ |
| ObjectNode jSpan = (ObjectNode)node; |
| int[] spanPos = new int[]{-1,-1}; |
| Collection<Entry<String,JsonNode>> jAnnotations = new ArrayList<Entry<String,JsonNode>>(4); |
| SpanTypeEnum spanType = parseSpanData(jSpan, spanPos, jAnnotations); |
| if(spanType != SpanTypeEnum.Text || spanPos[0] != 0 || spanPos[1] < 0){ |
| throw new IOException("The AnalyzedText span MUST have the SpanType 'text', a " |
| + "start position of '0' and an end position (ignored, json: "+jSpan); |
| } |
| if(at.getEnd() != spanPos[1]){ |
| throw new IOException("The size of the local text '"+at.getEnd()+"' does not " |
| + "match the span of the parsed AnalyzedText ["+spanPos[0]+","+spanPos[1]+"]!"); |
| } |
| parseAnnotations(at, jAnnotations); |
| } else { |
| throw new IOException("Unable to parse AnalyzedText span form JsonNode "+node+" (expected JSON object)!"); |
| } |
| |
| } |
| |
| private void parseSpan(AnalysedText at, JsonNode node) throws IOException { |
| if(node.isObject()){ |
| ObjectNode jSpan = (ObjectNode)node; |
| int[] spanPos = new int[]{-1,-1}; |
| Collection<Entry<String,JsonNode>> jAnnotations = new ArrayList<Entry<String,JsonNode>>(4); |
| SpanTypeEnum spanType = parseSpanData(jSpan, spanPos, jAnnotations); |
| if(spanType == null || spanPos[0] < 0 || spanPos[1] < 0){ |
| log.warn("Illegal or missing span type, start and/or end position (ignored, json: "+jSpan); |
| return; |
| } |
| //now create the Span |
| Span span; |
| switch (spanType) { |
| case Text: |
| log.warn("Encounterd 'Text' span that is not the first span in the " |
| + "'spans' array (ignored, json: "+node+")"); |
| return; |
| case TextSection: |
| log.warn("Encountered 'TextSection' span. This SpanTypeEnum entry " |
| + "is currently unused. If this is no longer the case please " |
| + "update this implementation (ignored, json: "+node+")"); |
| return; |
| case Sentence: |
| span = at.addSentence(spanPos[0], spanPos[1]); |
| break; |
| case Chunk: |
| span = at.addChunk(spanPos[0], spanPos[1]); |
| break; |
| case Token: |
| span = at.addToken(spanPos[0], spanPos[1]); |
| break; |
| default: |
| log.warn("Unsupported SpanTypeEnum '"+spanType+"'!. Please " |
| + "update this implementation (ignored, json: "+node+")"); |
| return; |
| } |
| if(!jAnnotations.isEmpty()){ |
| parseAnnotations(span,jAnnotations); |
| } |
| } else { |
| log.warn("Unable to parse Span form JsonNode "+node+" (expected JSON object)!"); |
| } |
| } |
| |
| /** |
| * @param jSpan |
| * @param spanPos |
| * @param jAnnotations |
| * @return the type of the parsed span |
| */ |
| private SpanTypeEnum parseSpanData(ObjectNode jSpan, int[] spanPos, |
| Collection<Entry<String,JsonNode>> jAnnotations) { |
| SpanTypeEnum spanType = null; |
| for(Iterator<Entry<String,JsonNode>> fields = jSpan.getFields(); fields.hasNext();){ |
| Entry<String,JsonNode> field = fields.next(); |
| if("type".equals(field.getKey())){ |
| if(field.getValue().isTextual()){ |
| spanType = SpanTypeEnum.valueOf(field.getValue().getTextValue()); |
| } else if(field.getValue().isInt()){ |
| spanType = SpanTypeEnum.values()[field.getValue().getIntValue()]; |
| } else { |
| log.warn("Unable to parse SpanType form JSON field "+field +" (ignored, json: "+jSpan+")"); |
| return null; |
| } |
| } else if("start".equals(field.getKey())){ |
| if(field.getValue().isInt()){ |
| spanPos[0] = field.getValue().getIntValue(); |
| } else { |
| log.warn("Unable to parse span start position form JSON field " |
| +field +" (ignored, json: "+jSpan+")"); |
| return null; |
| } |
| } else if("end".equals(field.getKey())){ |
| if(field.getValue().isInt()){ |
| spanPos[1] = field.getValue().getIntValue(); |
| } else { |
| log.warn("Unable to parse span end position form JSON field " |
| +field +" (ignored, json: "+jSpan+")"); |
| return null; |
| } |
| } else { |
| jAnnotations.add(field); |
| } |
| } |
| if(spanType == null){ |
| log.warn("Missing required field 'type' defining the type of the Span!"); |
| } |
| return spanType; |
| } |
| |
| |
| private void parseAnnotations(Span span, Collection<Entry<String,JsonNode>> jAnnotations) throws IOException { |
| for(Entry<String,JsonNode> jAnnotation : jAnnotations){ |
| if(jAnnotation.getValue().isObject()){ |
| parseAnnotation(span, jAnnotation.getKey(), (ObjectNode)jAnnotation.getValue()); |
| } else if(jAnnotation.getValue().isArray()){ |
| ArrayNode jValues = (ArrayNode)jAnnotation.getValue(); |
| for(int i=0;i< jValues.size();i++){ |
| JsonNode jValue = jValues.get(i); |
| if(jValue.isObject()){ |
| parseAnnotation(span, jAnnotation.getKey(), (ObjectNode)jValue); |
| } else { |
| log.warn("unable to parse the {} value of the annotation {} " |
| + "because value is no JSON object (ignored, json: {}", |
| new Object[]{i,jAnnotation.getKey(),jAnnotation.getValue()}); |
| } |
| } |
| } else { |
| log.warn("unable to parse Annotation {} because value is no JSON object (ignored, json: {}", |
| jAnnotation.getKey(),jAnnotation.getValue()); |
| } |
| } |
| |
| } |
| |
| private void parseAnnotation(Span span, String key, ObjectNode jValue) throws IOException { |
| JsonNode jClass = jValue.path("class"); |
| if(!jClass.isTextual()){ |
| log.warn("unable to parse Annotation {} because 'class' field " |
| + "is not set or not a stringis no JSON object (ignored, json: {}", |
| key,jValue); |
| return; |
| } |
| Class<?> clazz; |
| try { |
| clazz = AnalyzedTextParser.class.getClassLoader().loadClass(jClass.getTextValue()); |
| } catch (ClassNotFoundException e) { |
| log.warn("Unable to parse Annotation "+key |
| + " because the 'class' "+jClass.getTextValue()+" of the " |
| + "the value can not be resolved (ignored, json: "+jValue+")",e); |
| return; |
| } |
| ValueTypeParser<?> parser = this.valueTypeParserRegistry.getParser(clazz); |
| Object value; |
| if(parser != null){ |
| value = parser.parse(jValue); |
| } else { |
| JsonNode valueNode = jValue.path("value"); |
| if(valueNode.isMissingNode()){ |
| log.warn("unable to parse value for annotation {} because the " |
| + "field 'value' is not present (ignored, json: {}", |
| key,jValue); |
| return; |
| } else { |
| try { |
| value = mapper.treeToValue(valueNode, clazz); |
| } catch (JsonParseException e) { |
| log.warn("unable to parse value for annotation " |
| + key+ "because the value can" |
| + "not be converted to the class "+ clazz.getName() |
| + "(ignored, json: "+jValue+")",e); |
| return; |
| } catch (JsonMappingException e) { |
| log.warn("unable to parse value for annotation " |
| + key+ "because the value can" |
| + "not be converted to the class "+ clazz.getName() |
| + "(ignored, json: "+jValue+")",e); |
| return; |
| } |
| } |
| } |
| JsonNode jProb = jValue.path("prob"); |
| if(!jProb.isDouble()){ |
| span.addValue(key, Value.value(value)); |
| } else { |
| span.addValue(key, Value.value(value,jProb.getDoubleValue())); |
| } |
| } |
| |
| |
| /** |
| * Parses the SpanType for the parsed {@link ObjectNode} representing a {@link Span} |
| * @param jSpan the JSON root node of the span |
| * @return the type or <code>null</code> if the information is missing |
| */ |
| private SpanTypeEnum parseSpanType(ObjectNode jSpan) { |
| EnumSet<SpanTypeEnum> spanTypes = JsonUtils.parseEnum(jSpan, "type", SpanTypeEnum.class); |
| if(spanTypes.isEmpty()){ |
| log.warn("Unable to parse Span with missing 'type' (json: "+jSpan+")!"); |
| return null; |
| } |
| if(spanTypes.size() > 1){ |
| log.warn("Found Span with multiple 'types' (Json:"+jSpan+")!"); |
| } |
| return spanTypes.iterator().next(); |
| } |
| |
| } |