| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.tika.parser.jdbc; |
| |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.sql.Connection; |
| import java.sql.DriverManager; |
| import java.sql.SQLException; |
| import java.util.List; |
| import java.util.Set; |
| |
| import org.xml.sax.ContentHandler; |
| import org.xml.sax.SAXException; |
| |
| import org.apache.tika.exception.CorruptedFileException; |
| import org.apache.tika.exception.TikaException; |
| import org.apache.tika.extractor.EmbeddedDocumentUtil; |
| import org.apache.tika.metadata.Database; |
| import org.apache.tika.metadata.Metadata; |
| import org.apache.tika.mime.MediaType; |
| import org.apache.tika.parser.AbstractParser; |
| import org.apache.tika.parser.ParseContext; |
| import org.apache.tika.sax.XHTMLContentHandler; |
| |
| /** |
| * Abstract class that handles iterating through tables within a database. |
| */ |
| public abstract class AbstractDBParser extends AbstractParser { |
| |
| private final static byte[] EMPTY_BYTE_ARR = new byte[0]; |
| |
| private Connection connection; |
| |
| @Override |
| public Set<MediaType> getSupportedTypes(ParseContext context) { |
| return null; |
| } |
| |
| @Override |
| public void parse(InputStream stream, ContentHandler handler, Metadata metadata, |
| ParseContext context) throws IOException, SAXException, TikaException { |
| connection = getConnection(stream, metadata, context); |
| XHTMLContentHandler xHandler = null; |
| List<String> tableNames = null; |
| EmbeddedDocumentUtil embeddedDocumentUtil = new EmbeddedDocumentUtil(context); |
| try { |
| tableNames = getTableNames(connection, metadata, context); |
| } catch (SQLException e) { |
| try { |
| close(); |
| } catch (SQLException sqlE) { |
| //swallow |
| } |
| if (e.getClass().toString().contains("SQLiteException") && e.getMessage() != null && |
| (e.getMessage().contains("[SQLITE_ERROR]") || |
| e.getMessage().contains("[SQLITE_CORRUPT]"))) { |
| throw new CorruptedFileException("Corrupt SQLITE", e); |
| } |
| |
| throw new IOException(e); |
| } |
| for (String tableName : tableNames) { |
| //add table names to parent metadata |
| metadata.add(Database.TABLE_NAME, tableName); |
| } |
| xHandler = new XHTMLContentHandler(handler, metadata); |
| xHandler.startDocument(); |
| |
| try { |
| for (String tableName : tableNames) { |
| JDBCTableReader tableReader = |
| getTableReader(connection, tableName, embeddedDocumentUtil); |
| xHandler.startElement("table", "name", tableReader.getTableName()); |
| xHandler.startElement("thead"); |
| xHandler.startElement("tr"); |
| for (String header : tableReader.getHeaders()) { |
| xHandler.startElement("th"); |
| xHandler.characters(header); |
| xHandler.endElement("th"); |
| } |
| xHandler.endElement("tr"); |
| xHandler.endElement("thead"); |
| xHandler.startElement("tbody"); |
| while (tableReader.nextRow(xHandler, context)) { |
| //no-op |
| } |
| xHandler.endElement("tbody"); |
| xHandler.endElement("table"); |
| } |
| } finally { |
| try { |
| close(); |
| } catch (IOException | SQLException e) { |
| //swallow |
| } |
| if (xHandler != null) { |
| xHandler.endDocument(); |
| } |
| } |
| } |
| |
| /** |
| * Override this for any special handling of closing the connection. |
| * |
| * @throws java.sql.SQLException |
| * @throws java.io.IOException |
| */ |
| protected void close() throws SQLException, IOException { |
| connection.close(); |
| } |
| |
| /** |
| * Override this for special configuration of the connection, such as limiting |
| * the number of rows to be held in memory. |
| * |
| * @param stream stream to use |
| * @param metadata metadata that could be used in parameterizing the connection |
| * @param context parsecontext that could be used in parameterizing the connection |
| * @return connection |
| * @throws java.io.IOException |
| * @throws org.apache.tika.exception.TikaException |
| */ |
| protected Connection getConnection(InputStream stream, Metadata metadata, ParseContext context) |
| throws IOException, TikaException { |
| String connectionString = getConnectionString(stream, metadata, context); |
| |
| Connection connection = null; |
| try { |
| Class.forName(getJDBCClassName()); |
| } catch (ClassNotFoundException e) { |
| throw new TikaException(e.getMessage()); |
| } |
| try { |
| connection = DriverManager.getConnection(connectionString); |
| } catch (SQLException e) { |
| throw new IOException(e); |
| } |
| return connection; |
| } |
| |
| /** |
| * Implement for db specific connection information, e.g. "jdbc:sqlite:/docs/mydb.db" |
| * <p/> |
| * Include any optimization settings, user name, password, etc. |
| * <p/> |
| * |
| * @param stream stream for processing |
| * @param metadata metadata might be useful in determining connection info |
| * @param parseContext context to use to help create connectionString |
| * @return connection string to be used by {@link #getConnection}. |
| * @throws java.io.IOException |
| */ |
| abstract protected String getConnectionString(InputStream stream, Metadata metadata, |
| ParseContext parseContext) throws IOException; |
| |
| /** |
| * JDBC class name, e.g. org.sqlite.JDBC |
| * |
| * @return jdbc class name |
| */ |
| abstract protected String getJDBCClassName(); |
| |
| /** |
| * Returns the names of the tables to process |
| * |
| * @param connection Connection to use to make the sql call(s) to get the names of the tables |
| * @param metadata Metadata to use (potentially) in decision about which tables to extract |
| * @param context ParseContext to use (potentially) in decision about which tables to extract |
| * @return |
| * @throws java.sql.SQLException |
| */ |
| abstract protected List<String> getTableNames(Connection connection, Metadata metadata, |
| ParseContext context) throws SQLException; |
| |
| /** |
| * Given a connection and a table name, return the JDBCTableReader for this db. |
| * |
| * @param connection |
| * @param tableName |
| * @return a reader |
| * @deprecated use {@link #getTableReader(Connection, String, EmbeddedDocumentUtil)} |
| */ |
| @Deprecated |
| abstract protected JDBCTableReader getTableReader(Connection connection, String tableName, |
| ParseContext parseContext); |
| |
| /** |
| * Given a connection and a table name, return the JDBCTableReader for this db. |
| * |
| * @param connection |
| * @param tableName |
| * @param embeddedDocumentUtil embedded doc util |
| * @return |
| */ |
| abstract protected JDBCTableReader getTableReader(Connection connection, String tableName, |
| EmbeddedDocumentUtil embeddedDocumentUtil); |
| |
| } |