blob: 63c089abaaf0e6cb1b274d732c82c7d3a70a83bd [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.jdbc;
import java.io.IOException;
import java.io.InputStream;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.List;
import java.util.Set;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.CorruptedFileException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.metadata.Database;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
/**
* Abstract class that handles iterating through tables within a database.
*/
public abstract class AbstractDBParser extends AbstractParser {
private final static byte[] EMPTY_BYTE_ARR = new byte[0];
private Connection connection;
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
return null;
}
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
ParseContext context) throws IOException, SAXException, TikaException {
connection = getConnection(stream, metadata, context);
XHTMLContentHandler xHandler = null;
List<String> tableNames = null;
EmbeddedDocumentUtil embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
try {
tableNames = getTableNames(connection, metadata, context);
} catch (SQLException e) {
try {
close();
} catch (SQLException sqlE) {
//swallow
}
if (e.getClass().toString().contains("SQLiteException") && e.getMessage() != null &&
(e.getMessage().contains("[SQLITE_ERROR]") ||
e.getMessage().contains("[SQLITE_CORRUPT]"))) {
throw new CorruptedFileException("Corrupt SQLITE", e);
}
throw new IOException(e);
}
for (String tableName : tableNames) {
//add table names to parent metadata
metadata.add(Database.TABLE_NAME, tableName);
}
xHandler = new XHTMLContentHandler(handler, metadata);
xHandler.startDocument();
try {
for (String tableName : tableNames) {
JDBCTableReader tableReader =
getTableReader(connection, tableName, embeddedDocumentUtil);
xHandler.startElement("table", "name", tableReader.getTableName());
xHandler.startElement("thead");
xHandler.startElement("tr");
for (String header : tableReader.getHeaders()) {
xHandler.startElement("th");
xHandler.characters(header);
xHandler.endElement("th");
}
xHandler.endElement("tr");
xHandler.endElement("thead");
xHandler.startElement("tbody");
while (tableReader.nextRow(xHandler, context)) {
//no-op
}
xHandler.endElement("tbody");
xHandler.endElement("table");
}
} finally {
try {
close();
} catch (IOException | SQLException e) {
//swallow
}
if (xHandler != null) {
xHandler.endDocument();
}
}
}
/**
* Override this for any special handling of closing the connection.
*
* @throws java.sql.SQLException
* @throws java.io.IOException
*/
protected void close() throws SQLException, IOException {
connection.close();
}
/**
* Override this for special configuration of the connection, such as limiting
* the number of rows to be held in memory.
*
* @param stream stream to use
* @param metadata metadata that could be used in parameterizing the connection
* @param context parsecontext that could be used in parameterizing the connection
* @return connection
* @throws java.io.IOException
* @throws org.apache.tika.exception.TikaException
*/
protected Connection getConnection(InputStream stream, Metadata metadata, ParseContext context)
throws IOException, TikaException {
String connectionString = getConnectionString(stream, metadata, context);
Connection connection = null;
try {
Class.forName(getJDBCClassName());
} catch (ClassNotFoundException e) {
throw new TikaException(e.getMessage());
}
try {
connection = DriverManager.getConnection(connectionString);
} catch (SQLException e) {
throw new IOException(e);
}
return connection;
}
/**
* Implement for db specific connection information, e.g. "jdbc:sqlite:/docs/mydb.db"
* <p/>
* Include any optimization settings, user name, password, etc.
* <p/>
*
* @param stream stream for processing
* @param metadata metadata might be useful in determining connection info
* @param parseContext context to use to help create connectionString
* @return connection string to be used by {@link #getConnection}.
* @throws java.io.IOException
*/
abstract protected String getConnectionString(InputStream stream, Metadata metadata,
ParseContext parseContext) throws IOException;
/**
* JDBC class name, e.g. org.sqlite.JDBC
*
* @return jdbc class name
*/
abstract protected String getJDBCClassName();
/**
* Returns the names of the tables to process
*
* @param connection Connection to use to make the sql call(s) to get the names of the tables
* @param metadata Metadata to use (potentially) in decision about which tables to extract
* @param context ParseContext to use (potentially) in decision about which tables to extract
* @return
* @throws java.sql.SQLException
*/
abstract protected List<String> getTableNames(Connection connection, Metadata metadata,
ParseContext context) throws SQLException;
/**
* Given a connection and a table name, return the JDBCTableReader for this db.
*
* @param connection
* @param tableName
* @return a reader
* @deprecated use {@link #getTableReader(Connection, String, EmbeddedDocumentUtil)}
*/
@Deprecated
abstract protected JDBCTableReader getTableReader(Connection connection, String tableName,
ParseContext parseContext);
/**
* Given a connection and a table name, return the JDBCTableReader for this db.
*
* @param connection
* @param tableName
* @param embeddedDocumentUtil embedded doc util
* @return
*/
abstract protected JDBCTableReader getTableReader(Connection connection, String tableName,
EmbeddedDocumentUtil embeddedDocumentUtil);
}