| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.tika.mime; |
| |
| import org.apache.tika.detect.MagicDetector; |
| import org.w3c.dom.Attr; |
| import org.w3c.dom.Node; |
| import org.w3c.dom.Element; |
| import org.w3c.dom.Document; |
| import org.w3c.dom.NodeList; |
| import org.w3c.dom.NamedNodeMap; |
| import org.xml.sax.InputSource; |
| import org.xml.sax.SAXException; |
| |
| import java.io.ByteArrayOutputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.util.ArrayList; |
| import java.util.List; |
| |
| import javax.xml.parsers.DocumentBuilder; |
| import javax.xml.parsers.DocumentBuilderFactory; |
| import javax.xml.parsers.ParserConfigurationException; |
| |
| /** |
| * A reader for XML files compliant with the freedesktop MIME-info DTD. |
| * |
| * <pre> |
| * <!DOCTYPE mime-info [ |
| * <!ELEMENT mime-info (mime-type)+> |
| * <!ATTLIST mime-info xmlns CDATA #FIXED "http://www.freedesktop.org/standards/shared-mime-info"> |
| * |
| * <!ELEMENT mime-type (comment|acronym|expanded-acronym|glob|magic|root-XML|alias|sub-class-of)*> |
| * <!ATTLIST mime-type type CDATA #REQUIRED> |
| * |
| * <!-- a comment describing a document with the respective MIME type. Example: "WMV video" --> |
| * <!ELEMENT _comment (#PCDATA)> |
| * <!ATTLIST _comment xml:lang CDATA #IMPLIED> |
| * |
| * <!-- a comment giving the respective unexpanded MIME type acronym. Example: "WMV" --> |
| * <!ELEMENT acronym (#PCDATA)> |
| * <!ATTLIST acronym xml:lang CDATA #IMPLIED> |
| * |
| * <!-- a comment giving the respective expanded MIME type acronym. Example: "Windows Media Video" --> |
| * <!ELEMENT expanded-acronym (#PCDATA)> |
| * <!ATTLIST expanded-acronym xml:lang CDATA #IMPLIED> |
| * |
| * <!ELEMENT glob EMPTY> |
| * <!ATTLIST glob pattern CDATA #REQUIRED> |
| * <!ATTLIST glob isregex CDATA #IMPLIED> |
| * |
| * <!ELEMENT magic (match)+> |
| * <!ATTLIST magic priority CDATA #IMPLIED> |
| * |
| * <!ELEMENT match (match)*> |
| * <!ATTLIST match offset CDATA #REQUIRED> |
| * <!ATTLIST match type (string|big16|big32|little16|little32|host16|host32|byte) #REQUIRED> |
| * <!ATTLIST match value CDATA #REQUIRED> |
| * <!ATTLIST match mask CDATA #IMPLIED> |
| * |
| * <!ELEMENT root-XML EMPTY> |
| * <!ATTLIST root-XML |
| * namespaceURI CDATA #REQUIRED |
| * localName CDATA #REQUIRED> |
| * |
| * <!ELEMENT alias EMPTY> |
| * <!ATTLIST alias |
| * type CDATA #REQUIRED> |
| * |
| * <!ELEMENT sub-class-of EMPTY> |
| * <!ATTLIST sub-class-of |
| * type CDATA #REQUIRED> |
| * ]> |
| * </pre> |
| * |
| * |
| * @see <a href="http://freedesktop.org/wiki/Standards_2fshared_2dmime_2dinfo_2dspec">freedesktop.org shared MIME-info specification</a> |
| * |
| */ |
| final class MimeTypesReader implements MimeTypesReaderMetKeys { |
| |
| private final MimeTypes types; |
| |
| MimeTypesReader(MimeTypes types) { |
| this.types = types; |
| } |
| |
| void read(InputStream stream) throws IOException, MimeTypeException { |
| try { |
| DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); |
| DocumentBuilder builder = factory.newDocumentBuilder(); |
| Document document = builder.parse(new InputSource(stream)); |
| read(document); |
| } catch (ParserConfigurationException e) { |
| throw new MimeTypeException("Unable to create an XML parser", e); |
| } catch (SAXException e) { |
| throw new MimeTypeException("Invalid type configuration", e); |
| } |
| } |
| |
| void read(Document document) throws MimeTypeException { |
| Element element = document.getDocumentElement(); |
| if (element != null && element.getTagName().equals(MIME_INFO_TAG)) { |
| NodeList nodes = element.getChildNodes(); |
| for (int i = 0; i < nodes.getLength(); i++) { |
| Node node = nodes.item(i); |
| if (node.getNodeType() == Node.ELEMENT_NODE) { |
| Element child = (Element) node; |
| if (child.getTagName().equals(MIME_TYPE_TAG)) { |
| readMimeType(child); |
| } |
| } |
| } |
| } else { |
| throw new MimeTypeException( |
| "Not a <" + MIME_INFO_TAG + "/> configuration document: " |
| + element.getTagName()); |
| } |
| } |
| |
| /** Read Element named mime-type. */ |
| private void readMimeType(Element element) throws MimeTypeException { |
| String name = element.getAttribute(MIME_TYPE_TYPE_ATTR); |
| MimeType type = types.forName(name); |
| |
| NodeList nodes = element.getChildNodes(); |
| for (int i = 0; i < nodes.getLength(); i++) { |
| Node node = nodes.item(i); |
| if (node.getNodeType() == Node.ELEMENT_NODE) { |
| Element nodeElement = (Element) node; |
| if (nodeElement.getTagName().equals(COMMENT_TAG)) { |
| type.setDescription( |
| nodeElement.getFirstChild().getNodeValue()); |
| } else if (nodeElement.getTagName().equals(GLOB_TAG)) { |
| boolean useRegex = Boolean.valueOf(nodeElement.getAttribute(ISREGEX_ATTR)); |
| types.addPattern(type, nodeElement.getAttribute(PATTERN_ATTR), useRegex); |
| } else if (nodeElement.getTagName().equals(MAGIC_TAG)) { |
| readMagic(nodeElement, type); |
| } else if (nodeElement.getTagName().equals(ALIAS_TAG)) { |
| String alias = nodeElement.getAttribute(ALIAS_TYPE_ATTR); |
| MediaType aliasType = MediaType.parse(alias); |
| if (aliasType != null) { |
| types.addAlias(type, aliasType); |
| } else { |
| throw new MimeTypeException( |
| "Invalid media type alias: " + alias); |
| } |
| } else if (nodeElement.getTagName().equals(ROOT_XML_TAG)) { |
| readRootXML(nodeElement, type); |
| } else if (nodeElement.getTagName().equals(SUB_CLASS_OF_TAG)) { |
| String parent = nodeElement.getAttribute(SUB_CLASS_TYPE_ATTR); |
| types.setSuperType(type, MediaType.parse(parent)); |
| } |
| } |
| } |
| |
| types.add(type); |
| } |
| |
| /** |
| * Read Element named magic. |
| * @throws MimeTypeException if the configuration is invalid |
| */ |
| private void readMagic(Element element, MimeType mimeType) |
| throws MimeTypeException { |
| int priority = 50; |
| String value = element.getAttribute(MAGIC_PRIORITY_ATTR); |
| if (value != null && value.length() > 0) { |
| priority = Integer.parseInt(value); |
| } |
| |
| for (Clause clause : readMatches(element, mimeType.getType())) { |
| Magic magic = new Magic(mimeType); |
| magic.setPriority(priority); |
| magic.setClause(clause); |
| mimeType.addMagic(magic); |
| } |
| } |
| |
| private List<Clause> readMatches(Element element, MediaType mediaType) throws MimeTypeException { |
| List<Clause> clauses = new ArrayList<Clause>(); |
| NodeList nodes = element.getChildNodes(); |
| for (int i = 0; i < nodes.getLength(); i++) { |
| Node node = nodes.item(i); |
| if (node.getNodeType() == Node.ELEMENT_NODE) { |
| Element nodeElement = (Element) node; |
| if (nodeElement.getTagName().equals(MATCH_TAG)) { |
| clauses.add(readMatch(nodeElement, mediaType)); |
| } |
| } |
| } |
| return clauses; |
| } |
| |
| /** Read Element named match. */ |
| private Clause readMatch(Element element, MediaType mediaType) throws MimeTypeException { |
| String type = "string"; |
| int start = 0; |
| int end = 0; |
| String value = null; |
| String mask = null; |
| |
| NamedNodeMap attrs = element.getAttributes(); |
| for (int i = 0; i < attrs.getLength(); i++) { |
| Attr attr = (Attr) attrs.item(i); |
| if (attr.getName().equals(MATCH_OFFSET_ATTR)) { |
| String offset = attr.getValue(); |
| int colon = offset.indexOf(':'); |
| if (colon == -1) { |
| start = Integer.parseInt(offset); |
| end = start; |
| } else { |
| start = Integer.parseInt(offset.substring(0, colon)); |
| end = Integer.parseInt(offset.substring(colon + 1)); |
| } |
| } else if (attr.getName().equals(MATCH_TYPE_ATTR)) { |
| type = attr.getValue(); |
| } else if (attr.getName().equals(MATCH_VALUE_ATTR)) { |
| value = attr.getValue(); |
| } else if (attr.getName().equals(MATCH_MASK_ATTR)) { |
| mask = attr.getValue(); |
| } |
| } |
| |
| if (value == null) { |
| throw new MimeTypeException("Missing magic byte pattern"); |
| } else if (start < 0 || end < start) { |
| throw new MimeTypeException( |
| "Invalid offset range: [" + start + "," + end + "]"); |
| } |
| |
| byte[] patternBytes = decodeValue(type, value); |
| int length = patternBytes.length; |
| byte[] maskBytes = null; |
| if (mask != null) { |
| maskBytes = decodeValue(type, mask); |
| length = Math.max(patternBytes.length, maskBytes.length); |
| } |
| |
| MagicDetector detector = new MagicDetector( |
| mediaType, patternBytes, maskBytes, start, end); |
| Clause clause = new MagicMatch(detector, length); |
| |
| List<Clause> subClauses = readMatches(element, mediaType); |
| if (subClauses.size() == 0) { |
| return clause; |
| } else if (subClauses.size() == 1) { |
| return new AndClause(clause, subClauses.get(0)); |
| } else { |
| return new AndClause(clause, new OrClause(subClauses)); |
| } |
| } |
| |
| private byte[] decodeValue(String type, String value) |
| throws MimeTypeException { |
| // Preliminary check |
| if ((value == null) || (type == null)) { |
| return null; |
| } |
| |
| byte[] decoded = null; |
| String tmpVal = null; |
| int radix = 8; |
| |
| // hex |
| if (value.startsWith("0x")) { |
| tmpVal = value.substring(2); |
| radix = 16; |
| } else { |
| tmpVal = value; |
| radix = 8; |
| } |
| |
| if (type.equals("string")) { |
| decoded = decodeString(value); |
| |
| } else if (type.equals("byte")) { |
| decoded = tmpVal.getBytes(); |
| |
| } else if (type.equals("host16") || type.equals("little16")) { |
| int i = Integer.parseInt(tmpVal, radix); |
| decoded = new byte[] { (byte) (i >> 8), (byte) (i & 0x00FF) }; |
| |
| } else if (type.equals("big16")) { |
| int i = Integer.parseInt(tmpVal, radix); |
| decoded = new byte[] { (byte) (i >> 8), (byte) (i & 0x00FF) }; |
| |
| } else if (type.equals("host32") || type.equals("little32")) { |
| long i = Long.parseLong(tmpVal, radix); |
| decoded = new byte[] { (byte) ((i & 0x000000FF)), |
| (byte) ((i & 0x0000FF00) >> 8), |
| (byte) ((i & 0x00FF0000) >> 16), |
| (byte) ((i & 0xFF000000) >> 24) }; |
| |
| } else if (type.equals("big32")) { |
| long i = Long.parseLong(tmpVal, radix); |
| decoded = new byte[] { (byte) ((i & 0xFF000000) >> 24), |
| (byte) ((i & 0x00FF0000) >> 16), |
| (byte) ((i & 0x0000FF00) >> 8), (byte) ((i & 0x000000FF)) }; |
| } |
| return decoded; |
| } |
| |
| private byte[] decodeString(String value) throws MimeTypeException { |
| if (value.startsWith("0x")) { |
| byte[] bytes = new byte[(value.length() - 2) / 2]; |
| for (int i = 0; i < bytes.length; i++) { |
| bytes[i] = (byte) |
| Integer.parseInt(value.substring(2 + i * 2, 4 + i * 2), 16); |
| } |
| return bytes; |
| } |
| |
| try { |
| ByteArrayOutputStream decoded = new ByteArrayOutputStream(); |
| |
| for (int i = 0; i < value.length(); i++) { |
| if (value.charAt(i) == '\\') { |
| if (value.charAt(i + 1) == '\\') { |
| decoded.write('\\'); |
| i++; |
| } else if (value.charAt(i + 1) == 'x') { |
| decoded.write(Integer.parseInt( |
| value.substring(i + 2, i + 4), 16)); |
| i += 3; |
| } else { |
| int j = i + 1; |
| while ((j < i + 4) && (j < value.length()) |
| && (Character.isDigit(value.charAt(j)))) { |
| j++; |
| } |
| decoded.write(Short.decode( |
| "0" + value.substring(i + 1, j)).byteValue()); |
| i = j - 1; |
| } |
| } else { |
| decoded.write(value.charAt(i)); |
| } |
| } |
| return decoded.toByteArray(); |
| } catch (NumberFormatException e) { |
| throw new MimeTypeException("Invalid string value: " + value, e); |
| } |
| } |
| |
| /** Read Element named root-XML. */ |
| private void readRootXML(Element element, MimeType mimeType) { |
| mimeType.addRootXML(element.getAttribute(NS_URI_ATTR), element |
| .getAttribute(LOCAL_NAME_ATTR)); |
| } |
| |
| } |