/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
| package org.apache.stanbol.enhancer.engines.tika.metadata; |
| |
| import static org.apache.tika.metadata.DublinCore.DATE; |
| |
| import java.math.BigDecimal; |
| import java.math.BigInteger; |
| import java.net.URI; |
| import java.net.URISyntaxException; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.Date; |
| import java.util.HashSet; |
| import java.util.Iterator; |
| import java.util.LinkedHashMap; |
| import java.util.LinkedHashSet; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| |
| import org.apache.clerezza.rdf.core.BNode; |
| import org.apache.clerezza.rdf.core.InvalidLiteralTypeException; |
| import org.apache.clerezza.rdf.core.LiteralFactory; |
| import org.apache.clerezza.rdf.core.MGraph; |
| import org.apache.clerezza.rdf.core.NoConvertorException; |
| import org.apache.clerezza.rdf.core.NonLiteral; |
| import org.apache.clerezza.rdf.core.PlainLiteral; |
| import org.apache.clerezza.rdf.core.Resource; |
| import org.apache.clerezza.rdf.core.TypedLiteral; |
| import org.apache.clerezza.rdf.core.UriRef; |
| import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl; |
| import org.apache.clerezza.rdf.core.impl.TripleImpl; |
| import org.apache.clerezza.rdf.core.impl.TypedLiteralImpl; |
| import org.apache.clerezza.rdf.ontologies.RDFS; |
| import org.apache.clerezza.rdf.ontologies.XSD; |
| import org.apache.tika.metadata.DublinCore; |
| import org.apache.tika.metadata.Metadata; |
| import org.apache.tika.metadata.Property; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
/**
 * Used as value for Apache Tika {@link Metadata} mappings. Holds the
 * ontology property as {@link UriRef} and optionally a Tika {@link Property}.
 * The latter can be used to parse the correct datatype for values contained
 * in the {@link Metadata}.
 *
 * @author westei
 */
| public abstract class Mapping { |
| |
| private final static Logger log = LoggerFactory.getLogger(Mapping.class); |
| private static final LiteralFactory lf = LiteralFactory.getInstance(); |
| |
| /** |
| * List with allowed DataTypes.<ul> |
| * <li> <code>null</code> is used for {@link PlainLiteral}s |
| * <li> {@link XSD} datatyoes are used for {@link TypedLiteral}s |
| * <li> {@link RDFS#Resource} is used for {@link NonLiteral} values. Note |
| * that only {@link UriRef} is supported, because for Tika {@link BNode}s |
| * do not make sense. |
| * </ul> |
| */ |
| public static final Set<UriRef> ONT_TYPES; |
| /** |
| * Map with the same keys as contained in {@link #ONT_TYPES}. The values |
| * are the java types. |
| */ |
| protected static final Map<UriRef,Class<?>> ONT_TYPE_MAP; |
| |
| static { |
| //use a linked HasSetMap to have the nice ordering (mainly for logging) |
| Map<UriRef,Class<?>> map = new LinkedHashMap<UriRef,Class<?>>(); |
| //Plain Literal values |
| map.put(null,null); |
| //Typed Literal values |
| map.put(XSD.anyURI,URI.class); |
| map.put(XSD.base64Binary,new byte[]{}.getClass()); |
| map.put(XSD.boolean_,Boolean.class); |
| map.put(XSD.byte_,Byte.class); |
| map.put(XSD.date,Date.class); |
| map.put(XSD.dateTime,Date.class); |
| map.put(XSD.decimal,BigDecimal.class); |
| map.put(XSD.double_,Double.class); |
| map.put(XSD.float_,Float.class); |
| map.put(XSD.int_,Integer.class); |
| map.put(XSD.integer,BigInteger.class); |
| map.put(XSD.long_,Long.class); |
| map.put(XSD.short_,Short.class); |
| map.put(XSD.string,String.class); |
| map.put(XSD.time,Date.class); |
| //Data Types for NonLiteral values |
| map.put(RDFS.Resource,URI.class); |
| ONT_TYPE_MAP = Collections.unmodifiableMap(map); |
| ONT_TYPES = ONT_TYPE_MAP.keySet(); |
| |
| //NOTE: The following XSD types are not included |
| //XSD.gDay,XSD.gMonth,XSD.gMonthDay,XSD.gYearMonth,XSD.hexBinary,XSD.language, |
| //XSD.Name,XSD.NCName,XSD.negativeInteger,XSD.NMTOKEN,XSD.nonNegativeInteger, |
| //XSD.normalizedString,XSD.positiveInteger, |
| //XSD.token,XSD.unsignedByte,XSD.unsignedInt,XSD.unsignedLong,XSD.unsignedShort, |
| } |
| |
| protected final UriRef ontProperty; |
| |
| protected final Converter converter; |
| /** |
| * Getter for the OntologyProperty for this mapping |
| * @return the ontProperty |
| */ |
| public final UriRef getOntologyProperty() { |
| return ontProperty; |
| } |
| /** |
| * Getter for the set of Tika {@link Metadata} key names that are used |
| * by this mapping. This is typically used to determine if based on the |
| * present {@link Metadata#names()} a mapping need to be processed or not. |
| * <p>Mappings need to be called if any of the returned keys is present in |
| * the {@link Metadata}. Mappings that return an empty list MUST BE |
| * called. |
| * @return the Tika {@link Metadata} key names that are used by this mapping. |
| * If no keys are mapped than it MUST return an empty list. |
| */ |
| public abstract Set<String> getMappedTikaProperties(); |
| |
| protected final UriRef ontType; |
| |
| protected Mapping(UriRef ontProperty,UriRef ontType){ |
| this(ontProperty,ontType,null); |
| } |
| protected Mapping(UriRef ontProperty,UriRef ontType,Converter converter){ |
| if(ontProperty == null){ |
| throw new IllegalArgumentException("The parsed ontology property MUST NOT be NULL!"); |
| } |
| this.ontProperty = ontProperty; |
| if(!ONT_TYPES.contains(ontType)){ |
| throw new IllegalArgumentException("The ontology type '"+ontType |
| + "' is not supported. (supported: "+ONT_TYPES+")"); |
| } |
| this.ontType = ontType; |
| this.converter = converter; |
| } |
| |
| /** |
| * Applies this mapping based on the parsed {@link Metadata} and stores the |
| * results to {@link MGraph} |
| * @param graph the Graph to store the mapping results |
| * @param subject the subject (context) to add the mappings |
| * @param metadata the metadata used for applying the mapping |
| * @return <code>true</code> if the mapping could be applied based on the |
| * parsed data. Otherwise <code>false</code>. This is intended to be used |
| * by components that need to check if required mappings could be applied. |
| */ |
| public abstract boolean apply(MGraph graph, NonLiteral subject, Metadata metadata); |
| /** |
| * Converts the parsed value based on the mapping information to an RDF |
| * {@link Resource}. Optionally supports also validation if the parsed |
| * value is valid for the {@link Mapping#ontType ontology type} specified by |
| * the parsed mapping. |
| * @param value the value |
| * @param mapping the mapping |
| * @param validate |
| * @return the {@link Resource} or <code>null</code> if the parsed value is |
| * <code>null</code> or {@link String#isEmpty() empty}. |
| * @throws IllegalArgumentException if the parsed {@link Mapping} is |
| * <code>null</code> |
| */ |
| protected Resource toResource(String value, boolean validate){ |
| Metadata dummy = null;//used for date validation |
| if(value == null || value.isEmpty()){ |
| return null; //ignore null and empty values |
| } |
| Resource object; |
| if(ontType == null){ |
| object = new PlainLiteralImpl(value); |
| } else if(ontType == RDFS.Resource){ |
| try { |
| if(validate){ |
| new URI(value); |
| } |
| object = new UriRef(value); |
| } catch (URISyntaxException e) { |
| log.warn("Unable to create Reference for value {} (not a valid URI)" + |
| " -> create a literal instead",value); |
| object = new PlainLiteralImpl(value); |
| } |
| } else { //typed literal |
| Class<?> clazz = Mapping.ONT_TYPE_MAP.get(ontType); |
| if(clazz.equals(Date.class)){ //special handling for dates :( |
| //Dates are special, because Clerezza requires W3C date format |
| //and Tika uses the iso8601 variants. |
| //Because of that here is Tika used to get the Date object for |
| //the parsed value and than the LiteralFactory of Clerezza to |
| //create the TypedLiteral. |
| //Note that because of that no validation is required for |
| //Dates. |
| |
| //Need a dummy metadata object to get access to the private |
| //parseDate(..) method |
| if(dummy == null) { |
| dummy = new Metadata(); |
| } |
| //any Property with the Date type could be used here |
| dummy.add(DATE.getName(), value); |
| Date date = dummy.getDate(DublinCore.DATE); //access parseDate(..) |
| if(date != null){ //now use the Clerezza Literal factory |
| object = lf.createTypedLiteral(date); |
| } else { //fall back to xsd:string |
| object = new TypedLiteralImpl(value, XSD.string); |
| } |
| } else { |
| object = new TypedLiteralImpl(value, ontType); |
| } |
| if(validate && clazz != null && |
| !clazz.equals(Date.class)){ //we need not to validate dates |
| try { |
| lf.createObject(clazz,(TypedLiteral)object); |
| } catch (NoConvertorException e) { |
| log.info("Unable to validate typed literals of type {} because" + |
| "there is no converter for Class {} registered with Clerezza", |
| ontType,clazz); |
| } catch (InvalidLiteralTypeException e) { |
| log.info("The value '{}' is not valid for dataType {}!" + |
| "create literal with type 'xsd:string' instead", |
| value,ontType); |
| object = new TypedLiteralImpl(value, XSD.string); |
| } |
| } //else no validation needed |
| } |
| if(converter != null){ |
| object = converter.convert(object); |
| } |
| return object; |
| } |
| /** |
| * Used by subclasses to log mapped information |
| */ |
| protected final static MappingLogger mappingLogger = new MappingLogger(); |
| /** |
| * Allows nicely formatted logging of mapped properties |
| * @author Rupert Westenthaler |
| * |
| */ |
| protected static class MappingLogger{ |
| |
| private List<NonLiteral> subjects = new ArrayList<NonLiteral>(); |
| private UriRef predicate; |
| private final int intendSize = 2; |
| private final char[] intnedArray; |
| private static final int MAX_INTEND = 5; |
| |
| private MappingLogger(){ |
| intnedArray = new char[MAX_INTEND*intendSize]; |
| Arrays.fill(intnedArray, ' '); |
| } |
| private String getIntend(int intend){ |
| return String.copyValueOf(intnedArray, 0, |
| Math.min(MAX_INTEND, intend)*intendSize); |
| } |
| |
| protected void log(NonLiteral subject,UriRef predicate, String prop, Resource object){ |
| if(!log.isDebugEnabled()){ |
| return; |
| } |
| int intendCount = subjects.indexOf(subject)+1; |
| final String intend; |
| if(intendCount < 1){ |
| subjects.add(subject); |
| intendCount = subjects.size(); |
| intend = getIntend(intendCount); |
| log.debug("{}context: {}",intend,subject); |
| } else if(intendCount < subjects.size()){ |
| for(int i = intendCount;i<subjects.size();i++){ |
| subjects.remove(i); |
| } |
| intend = getIntend(intendCount); |
| } else { |
| intend = getIntend(intendCount); |
| } |
| if(!predicate.equals(this.predicate)){ |
| log.debug("{} {}",intend,predicate); |
| } |
| log.debug("{} {} {}",new Object[]{ |
| intend,object,prop != null ? ("(from: '"+prop+')') : "" |
| }); |
| } |
| } |
| |
| public static interface Converter { |
| Resource convert(Resource value); |
| } |
| } |