/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| package org.apache.tika.config; |
| |
| import java.io.File; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.net.URL; |
| import java.util.ArrayList; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| |
| import javax.imageio.spi.ServiceRegistry; |
| import javax.xml.parsers.DocumentBuilder; |
| import javax.xml.parsers.DocumentBuilderFactory; |
| import javax.xml.parsers.ParserConfigurationException; |
| |
| import org.apache.tika.exception.TikaException; |
| import org.apache.tika.mime.MediaType; |
| import org.apache.tika.mime.MediaTypeRegistry; |
| import org.apache.tika.mime.MimeTypeException; |
| import org.apache.tika.mime.MimeTypes; |
| import org.apache.tika.mime.MimeTypesFactory; |
| import org.apache.tika.parser.CompositeParser; |
| import org.apache.tika.parser.DefaultParser; |
| import org.apache.tika.parser.Parser; |
| import org.apache.tika.parser.ParserDecorator; |
| import org.w3c.dom.Document; |
| import org.w3c.dom.Element; |
| import org.w3c.dom.Node; |
| import org.w3c.dom.NodeList; |
| import org.xml.sax.SAXException; |
| |
| /** |
| * Parse xml config file. |
| */ |
| public class TikaConfig { |
| |
| private final CompositeParser parser; |
| |
| private final MimeTypes mimeTypes; |
| |
| private TikaConfig(CompositeParser parser, MimeTypes mimeTypes) { |
| this.parser = parser; |
| this.mimeTypes = mimeTypes; |
| } |
| |
| private TikaConfig(CompositeParser parser) { |
| this(parser, MimeTypes.getDefaultMimeTypes()); |
| } |
| |
| public TikaConfig(String file) |
| throws TikaException, IOException, SAXException { |
| this(new File(file)); |
| } |
| |
| public TikaConfig(File file) |
| throws TikaException, IOException, SAXException { |
| this(getBuilder().parse(file)); |
| } |
| |
| public TikaConfig(URL url) |
| throws TikaException, IOException, SAXException { |
| this(getBuilder().parse(url.toString())); |
| } |
| |
| public TikaConfig(InputStream stream) |
| throws TikaException, IOException, SAXException { |
| this(getBuilder().parse(stream)); |
| } |
| |
| /** |
| * @deprecated This method will be removed in Apache Tika 1.0 |
| * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a> |
| */ |
| public TikaConfig(InputStream stream, Parser delegate) |
| throws TikaException, IOException, SAXException { |
| this(stream); |
| } |
| |
| public TikaConfig(Document document) throws TikaException, IOException { |
| this(document.getDocumentElement()); |
| } |
| |
| /** |
| * @deprecated This method will be removed in Apache Tika 1.0 |
| * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a> |
| */ |
| public TikaConfig(Document document, Parser delegate) |
| throws TikaException, IOException { |
| this(document); |
| } |
| |
| public TikaConfig(Element element) throws TikaException, IOException { |
| Element mtr = getChild(element, "mimeTypeRepository"); |
| if (mtr != null && mtr.hasAttribute("resource")) { |
| mimeTypes = MimeTypesFactory.create(mtr.getAttribute("resource")); |
| } else { |
| mimeTypes = MimeTypes.getDefaultMimeTypes(); |
| } |
| |
| List<Parser> parsers = new ArrayList<Parser>(); |
| NodeList nodes = element.getElementsByTagName("parser"); |
| for (int i = 0; i < nodes.getLength(); i++) { |
| Element node = (Element) nodes.item(i); |
| String name = node.getAttribute("class"); |
| |
| try { |
| Class<?> parserClass = Class.forName(name); |
| Object instance = parserClass.newInstance(); |
| if (!(instance instanceof Parser)) { |
| throw new TikaException( |
| "Configured class is not a Tika Parser: " + name); |
| } |
| Parser parser = (Parser) instance; |
| |
| NodeList mimes = node.getElementsByTagName("mime"); |
| if (mimes.getLength() > 0) { |
| Set<MediaType> types = new HashSet<MediaType>(); |
| for (int j = 0; j < mimes.getLength(); j++) { |
| String mime = getText(mimes.item(j)); |
| MediaType type = MediaType.parse(mime); |
| if (type != null) { |
| types.add(type); |
| } else { |
| throw new TikaException( |
| "Invalid media type name: " + mime); |
| } |
| } |
| parser = ParserDecorator.withTypes(parser, types); |
| } |
| |
| parsers.add(parser); |
| } catch (ClassNotFoundException e) { |
| throw new TikaException( |
| "Configured parser class not found: " + name, e); |
| } catch (IllegalAccessException e) { |
| throw new TikaException( |
| "Unable to access a parser class: " + name, e); |
| } catch (InstantiationException e) { |
| throw new TikaException( |
| "Unable to instantiate a parser class: " + name, e); |
| } |
| } |
| this.parser = |
| new CompositeParser(mimeTypes.getMediaTypeRegistry(), parsers); |
| } |
| |
| /** |
| * Creates a Tika configuration from the built-in media type rules |
| * and all the {@link Parser} implementations available through the |
| * {@link ServiceRegistry service provider mechanism} in the given |
| * class loader. |
| * |
| * @since Apache Tika 0.8 |
| * @param loader the class loader through which parser implementations |
| * are loaded, or <code>null</code> for no parsers |
| * @throws MimeTypeException if the built-in media type rules are broken |
| * @throws IOException if the built-in media type rules can not be read |
| */ |
| public TikaConfig(ClassLoader loader) |
| throws MimeTypeException, IOException { |
| this(new DefaultParser(loader)); |
| } |
| |
| /** |
| * Creates a Tika configuration from the built-in media type rules |
| * and all the {@link Parser} implementations available through the |
| * {@link ServiceRegistry service provider mechanism} in the context |
| * class loader of the current thread. |
| * |
| * @throws MimeTypeException if the built-in media type rules are broken |
| * @throws IOException if the built-in media type rules can not be read |
| */ |
| public TikaConfig() throws MimeTypeException, IOException { |
| this(new DefaultParser()); |
| } |
| |
| /** |
| * @deprecated This method will be removed in Apache Tika 1.0 |
| * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a> |
| */ |
| public TikaConfig(Element element, Parser delegate) |
| throws TikaException, IOException { |
| this(element); |
| } |
| |
| private String getText(Node node) { |
| if (node.getNodeType() == Node.TEXT_NODE) { |
| return node.getNodeValue(); |
| } else if (node.getNodeType() == Node.ELEMENT_NODE) { |
| StringBuilder builder = new StringBuilder(); |
| NodeList list = node.getChildNodes(); |
| for (int i = 0; i < list.getLength(); i++) { |
| builder.append(getText(list.item(i))); |
| } |
| return builder.toString(); |
| } else { |
| return ""; |
| } |
| } |
| |
| /** |
| * @deprecated Use the {@link #getParser()} method instead |
| */ |
| public Parser getParser(MediaType mimeType) { |
| return parser.getParsers().get(mimeType); |
| } |
| |
| /** |
| * Returns the configured parser instance. |
| * |
| * @return configured parser |
| */ |
| public Parser getParser() { |
| return parser; |
| } |
| |
| /** |
| * @deprecated Use the {@link #getParser()} method instead |
| */ |
| public Map<MediaType, Parser> getParsers() { |
| return parser.getParsers(); |
| } |
| |
| public MimeTypes getMimeRepository(){ |
| return mimeTypes; |
| } |
| |
| public MediaTypeRegistry getMediaTypeRegistry() { |
| return mimeTypes.getMediaTypeRegistry(); |
| } |
| |
| /** |
| * Provides a default configuration (TikaConfig). Currently creates a |
| * new instance each time it's called; we may be able to have it |
| * return a shared instance once it is completely immutable. |
| * |
| * @return default configuration |
| */ |
| public static TikaConfig getDefaultConfig() { |
| try { |
| return new TikaConfig(); |
| } catch (IOException e) { |
| throw new RuntimeException( |
| "Unable to read default configuration", e); |
| } catch (TikaException e) { |
| throw new RuntimeException( |
| "Unable to access default configuration", e); |
| } |
| } |
| |
| /** |
| * @deprecated This method will be removed in Apache Tika 1.0 |
| * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a> |
| */ |
| public static TikaConfig getDefaultConfig(Parser delegate) |
| throws TikaException { |
| return getDefaultConfig(); |
| } |
| |
| private static DocumentBuilder getBuilder() throws TikaException { |
| try { |
| return DocumentBuilderFactory.newInstance().newDocumentBuilder(); |
| } catch (ParserConfigurationException e) { |
| throw new TikaException("XML parser not available", e); |
| } |
| } |
| |
| private static Element getChild(Element element, String name) { |
| Node child = element.getFirstChild(); |
| while (child != null) { |
| if (child.getNodeType() == Node.ELEMENT_NODE |
| && name.equals(child.getNodeName())) { |
| return (Element) child; |
| } |
| child = child.getNextSibling(); |
| } |
| return null; |
| } |
| |
| } |