/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| package org.apache.tika.metadata; |
| |
| import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; |
| import org.apache.tika.sax.ContentHandlerFactory; |
| import org.apache.tika.sax.RecursiveParserWrapperHandler; |
| import org.apache.tika.utils.ParserUtils; |
| |
| /** |
| * Contains a core set of basic Tika metadata properties, which all parsers |
| * will attempt to supply (where the file format permits). These are all |
| * defined in terms of other standard namespaces. |
| * |
| * Users of Tika who wish to have consistent metadata across file formats |
| * can make use of these Properties, knowing that where present they will |
| * have consistent semantic meaning between different file formats. (No |
| * matter if one file format calls it Title, another Long-Title and another |
| * Long-Name, if they all mean the same thing as defined by |
| * {@link DublinCore#TITLE} then they will all be present as such) |
| * |
| * For now, most of these properties are composite ones including the deprecated |
| * non-prefixed String properties from the Metadata class. In Tika 2.0, most |
| * of these will revert back to simple assignments. |
| * |
| * @since Apache Tika 1.2 |
| */ |
| @SuppressWarnings("deprecation") |
| public interface TikaCoreProperties { |
| |
| /** |
| * A file might contain different types of embedded documents. |
| * The most common is the ATTACHMENT. |
| * <p> |
| * An INLINE embedded resource should be used for embedded image |
| * files that are used to render the page image (as in PDXObjImages in PDF files). |
| * <p> |
| * A MACRO is code that is embedded in the document and is intended |
| * to be executable within the application that opens the document. This |
| * includes traditional macros within Microsoft Office files and |
| * javascript within PDFActions. This would not include, e.g., an |
| * .exe file embedded in a .zip file. |
| * <p> |
| * Not all parsers have yet implemented this. |
| * |
| */ |
| public enum EmbeddedResourceType { |
| INLINE, //image that is intended to be displayed in a rendering of the file |
| ATTACHMENT,//standard attachment as in email |
| MACRO, //any code that is intended to be run by the application |
| METADATA, //e.g. xmp, xfa |
| FONT,//embedded font files |
| THUMBNAIL;//TODO: set this in parsers that handle thumbnails |
| }; |
| |
| /** |
| * The common delimiter used between the namespace abbreviation and the property name |
| */ |
| String NAMESPACE_PREFIX_DELIMITER = ":"; |
| |
| /** |
| * Use this to prefix metadata properties that store information |
| * about the parsing process. Users should be able to distinguish |
| * between metadata that was contained within the document and |
| * metadata about the parsing process. |
| */ |
| String TIKA_META_PREFIX = "X-TIKA"+NAMESPACE_PREFIX_DELIMITER; |
| |
| Property EMBEDDED_DEPTH = |
| Property.internalInteger(TIKA_META_PREFIX +"embedded_depth"); |
| Property EMBEDDED_RESOURCE_PATH = |
| Property.internalText(TIKA_META_PREFIX +"embedded_resource_path"); |
| Property PARSE_TIME_MILLIS = Property.internalText(TIKA_META_PREFIX + "parse_time_millis"); |
| /** |
| * Simple class name of the content handler |
| */ |
| Property TIKA_CONTENT_HANDLER = Property.internalText(TIKA_META_PREFIX +"content_handler"); |
| Property TIKA_CONTENT = Property.internalText(TIKA_META_PREFIX +"content"); |
| |
| /** |
| * Use this to store parse exception information in the Metadata object. |
| */ |
| String TIKA_META_EXCEPTION_PREFIX = TIKA_META_PREFIX+"EXCEPTION"+ |
| NAMESPACE_PREFIX_DELIMITER; |
| |
| //exception in main file |
| Property CONTAINER_EXCEPTION = Property.internalText( |
| TIKA_META_EXCEPTION_PREFIX +"container_exception"); |
| |
| //exception in an embedded file |
| Property EMBEDDED_EXCEPTION = |
| Property.internalText( |
| TIKA_META_EXCEPTION_PREFIX + "embedded_exception"); |
| |
| Property WRITE_LIMIT_REACHED = |
| Property.internalBoolean(TIKA_META_EXCEPTION_PREFIX + "write_limit_reached"); |
| |
| Property EMBEDDED_RESOURCE_LIMIT_REACHED = |
| Property.internalBoolean(TIKA_META_EXCEPTION_PREFIX + "embedded_resource_limit_reached"); |
| |
| /** |
| * Use this to store exceptions caught during a parse that are |
| * non-fatal, e.g. if a parser is in lenient mode and more |
| * content can be extracted if we ignore an exception thrown by |
| * a dependency. |
| */ |
| public static final Property TIKA_META_EXCEPTION_WARNING = |
| Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX+"warn"); |
| |
| /** |
| * Use this to store exceptions caught while trying to read the |
| * stream of an embedded resource. Do not use this if there is |
| * a parse exception on the embedded resource. |
| */ |
| Property TIKA_META_EXCEPTION_EMBEDDED_STREAM = |
| Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX+"embedded_stream_exception"); |
| |
| Property TIKA_PARSED_BY = |
| Property.internalTextBag(TIKA_META_PREFIX+"Parsed-By"); |
| |
| String RESOURCE_NAME_KEY = "resourceName"; |
| |
| String PROTECTED = "protected"; |
| |
| String EMBEDDED_RELATIONSHIP_ID = "embeddedRelationshipId"; |
| |
| String EMBEDDED_STORAGE_CLASS_ID = "embeddedStorageClassId"; |
| |
| String EMBEDDED_RESOURCE_TYPE_KEY = "embeddedResourceType"; |
| |
| |
| /** |
| * Some file formats can store information about their original |
| * file name/location or about their attachment's original file name/location |
| * within the file. |
| */ |
| Property ORIGINAL_RESOURCE_NAME = |
| Property.internalTextBag(TIKA_META_PREFIX+"origResourceName"); |
| |
| /** |
| * This should be used to store the path (relative or full) |
| * of the source file, including the file name, |
| * e.g. doc/path/to/my_pdf.pdf |
| * |
| * This can also be used for a primary key within a database. |
| */ |
| Property SOURCE_PATH = |
| Property.internalText(TIKA_META_PREFIX+"sourcePath"); |
| /** |
| * This is currently used to identify Content-Type that may be |
| * included within a document, such as in html documents |
| * (e.g. <meta http-equiv="content-type" content="text/html; charset=UTF-8">) |
| , or the value might come from outside the document. This information |
| * may be faulty and should be treated only as a hint. |
| */ |
| Property CONTENT_TYPE_HINT = |
| Property.internalText(HttpHeaders.CONTENT_TYPE+"-Hint"); |
| |
| /** |
| * This is used by users to override detection with the override detector. |
| */ |
| Property CONTENT_TYPE_USER_OVERRIDE = |
| Property.internalText(HttpHeaders.CONTENT_TYPE+"-Override"); |
| |
| |
| /** |
| * This is used by parsers to override detection of embedded resources |
| * with the override detector. |
| */ |
| Property CONTENT_TYPE_PARSER_OVERRIDE = |
| Property.internalText(HttpHeaders.CONTENT_TYPE+"-Parser-Override"); |
| |
| /** |
| * @see DublinCore#FORMAT |
| */ |
| Property FORMAT = DublinCore.FORMAT; |
| |
| /** |
| * @see DublinCore#IDENTIFIER |
| */ |
| Property IDENTIFIER = DublinCore.IDENTIFIER; |
| |
| /** |
| * @see DublinCore#CONTRIBUTOR |
| */ |
| Property CONTRIBUTOR = DublinCore.CONTRIBUTOR; |
| |
| /** |
| * @see DublinCore#COVERAGE |
| */ |
| Property COVERAGE = DublinCore.COVERAGE; |
| |
| /** |
| * @see DublinCore#CREATOR |
| */ |
| Property CREATOR = DublinCore.CREATOR; |
| |
| /** |
| * @see Office#LAST_AUTHOR |
| */ |
| Property MODIFIER = Office.LAST_AUTHOR; |
| |
| /** |
| * @see XMP#CREATOR_TOOL |
| */ |
| Property CREATOR_TOOL = XMP.CREATOR_TOOL; |
| |
| /** |
| * @see DublinCore#LANGUAGE |
| */ |
| Property LANGUAGE = DublinCore.LANGUAGE; |
| |
| /** |
| * @see DublinCore#PUBLISHER |
| */ |
| Property PUBLISHER = DublinCore.PUBLISHER; |
| |
| /** |
| * @see DublinCore#RELATION |
| */ |
| Property RELATION = DublinCore.RELATION; |
| |
| /** |
| * @see DublinCore#RIGHTS |
| */ |
| Property RIGHTS = DublinCore.RIGHTS; |
| |
| /** |
| * @see DublinCore#SOURCE |
| */ |
| Property SOURCE = DublinCore.SOURCE; |
| |
| /** |
| * @see DublinCore#TYPE |
| */ |
| Property TYPE = DublinCore.TYPE; |
| |
| // Descriptive properties |
| |
| /** |
| * @see DublinCore#TITLE |
| */ |
| Property TITLE = DublinCore.TITLE; |
| |
| /** |
| * @see DublinCore#DESCRIPTION |
| */ |
| Property DESCRIPTION = DublinCore.DESCRIPTION; |
| |
| /** |
| * {@link DublinCore#SUBJECT}; should include both subject and keywords |
| * if a document format has both. See also {@link Office#KEYWORDS} |
| * and {@link OfficeOpenXMLCore#SUBJECT}. |
| */ |
| Property SUBJECT = DublinCore.SUBJECT; |
| |
| // Date related properties |
| |
| /** |
| * @see DublinCore#DATE |
| */ |
| Property CREATED = DublinCore.CREATED; |
| |
| /** |
| * @see DublinCore#MODIFIED |
| * @see Office#SAVE_DATE |
| */ |
| Property MODIFIED = DublinCore.MODIFIED; |
| |
| /** @see Office#PRINT_DATE */ |
| Property PRINT_DATE = Office.PRINT_DATE; |
| |
| /** |
| * @see XMP#METADATA_DATE |
| */ |
| Property METADATA_DATE = XMP.METADATA_DATE; |
| |
| |
| // Geographic related properties |
| |
| /** |
| * @see Geographic#LATITUDE |
| */ |
| Property LATITUDE = Geographic.LATITUDE; |
| |
| /** |
| * @see Geographic#LONGITUDE |
| */ |
| Property LONGITUDE = Geographic.LONGITUDE; |
| |
| /** |
| * @see Geographic#ALTITUDE |
| */ |
| Property ALTITUDE = Geographic.ALTITUDE; |
| |
| |
| // Comment and rating properties |
| |
| /** |
| * @see XMP#RATING |
| */ |
| Property RATING = XMP.RATING; |
| |
| /** |
| * @see OfficeOpenXMLExtended#COMMENTS |
| */ |
| Property COMMENTS = OfficeOpenXMLExtended.COMMENTS; |
| |
| /** |
| * Embedded resource type property |
| */ |
| Property EMBEDDED_RESOURCE_TYPE = |
| Property.internalClosedChoise(EMBEDDED_RESOURCE_TYPE_KEY, |
| EmbeddedResourceType.ATTACHMENT.toString(), |
| EmbeddedResourceType.INLINE.toString(), |
| EmbeddedResourceType.METADATA.toString(), |
| EmbeddedResourceType.MACRO.toString(), |
| EmbeddedResourceType.THUMBNAIL.toString()); |
| |
| |
| |
| Property HAS_SIGNATURE = Property.internalBoolean("hasSignature"); |
| } |