blob: 3ed34b9d6f7075ff90e1dddec387892d5e066865 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.PipedReader;
import java.io.PipedWriter;
import java.io.Reader;
import java.io.Writer;
import java.util.concurrent.Executor;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
/**
 * Reader for the text content from a given binary stream. This class
 * uses a background parsing task with a {@link Parser}
 * ({@link AutoDetectParser} by default) to parse the text content from
 * a given input stream. The {@link BodyContentHandler} class and a pipe
 * are used to convert the push-based SAX event stream to the pull-based
 * character stream defined by the {@link Reader} interface.
 * <p>
 * The parsing task runs on a thread provided by an {@link Executor} while
 * the consumer pulls characters through this reader. The only state shared
 * directly between the two sides (apart from the pipe itself) is the
 * failure recorded by the parsing task, which is re-thrown to the consumer
 * from {@link #read(char[], int, int)}.
 *
 * @since Apache Tika 0.2
 */
public class ParsingReader extends Reader {

    /**
     * Parser instance used for parsing the given binary stream.
     */
    private final Parser parser;

    /**
     * Buffered read end of the pipe.
     */
    private final Reader reader;

    /**
     * Write end of the pipe.
     */
    private final Writer writer;

    /**
     * The binary stream being parsed.
     */
    private final InputStream stream;

    /**
     * Metadata associated with the document being parsed.
     */
    private final Metadata metadata;

    /**
     * The parse context.
     */
    private final ParseContext context;

    /**
     * An exception (if any) thrown by the parsing thread. Declared
     * <code>volatile</code> because it is written by the parsing task
     * and read by the thread that consumes this reader.
     */
    private transient volatile Throwable throwable;

    /**
     * Utility method that returns a {@link Metadata} instance
     * for a document with the given name.
     *
     * @param name resource name (or <code>null</code>)
     * @return metadata instance; the resource name is only set when the
     *         given name is neither <code>null</code> nor empty
     */
    private static Metadata getMetadata(String name) {
        Metadata metadata = new Metadata();
        if (name != null && name.length() > 0) {
            metadata.set(Metadata.RESOURCE_NAME_KEY, name);
        }
        return metadata;
    }

    /**
     * Utility method that returns a new {@link ParseContext} in which the
     * given parser is already registered under {@link Parser Parser.class}.
     * Used by the deprecated constructors so that the context is fully
     * populated <em>before</em> the background parsing task is started.
     *
     * @param parser parser instance to register in the context
     * @return parse context containing the given parser
     */
    private static ParseContext getContext(Parser parser) {
        ParseContext context = new ParseContext();
        context.set(Parser.class, parser);
        return context;
    }

    /**
     * Creates a reader for the text content of the given binary stream.
     *
     * @param stream binary stream
     * @throws IOException if the document can not be parsed
     */
    public ParsingReader(InputStream stream) throws IOException {
        this(new AutoDetectParser(), stream, new Metadata());
    }

    /**
     * Creates a reader for the text content of the given binary stream
     * with the given name.
     *
     * @param stream binary stream
     * @param name document name
     * @throws IOException if the document can not be parsed
     */
    public ParsingReader(InputStream stream, String name) throws IOException {
        this(new AutoDetectParser(), stream, getMetadata(name));
    }

    /**
     * Creates a reader for the text content of the given file.
     *
     * @param file file
     * @throws FileNotFoundException if the given file does not exist
     * @throws IOException if the document can not be parsed
     */
    public ParsingReader(File file) throws FileNotFoundException, IOException {
        this(new FileInputStream(file), file.getName());
    }

    /**
     * Creates a reader for the text content of the given binary stream
     * with the given document metadata. The given parser is used for
     * parsing. A new background daemon thread is started for the parsing
     * task; the thread is named "Apache Tika: <i>name</i>" when the
     * metadata contains a resource name, and "Apache Tika" otherwise.
     *
     * @param parser parser instance
     * @param stream binary stream
     * @param metadata document metadata
     * @param context parsing context
     * @throws IOException if the document can not be parsed
     */
    public ParsingReader(
            Parser parser, InputStream stream, final Metadata metadata,
            ParseContext context) throws IOException {
        this(parser, stream, metadata, context, new Executor() {
            public void execute(Runnable command) {
                // The name is looked up when the task is submitted, so it
                // reflects the metadata as it is at that moment.
                String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
                if (name != null) {
                    name = "Apache Tika: " + name;
                } else {
                    name = "Apache Tika";
                }
                Thread thread = new Thread(command, name);
                thread.setDaemon(true);
                thread.start();
            }
        });
    }

    /**
     * Creates a reader for the text content of the given binary stream
     * with the given document metadata. The given parser is used for the
     * parsing task that is run with the given executor. The given executor
     * <em>must</em> run the parsing task asynchronously in a separate thread,
     * since the current thread must return to the caller that can then
     * consume the parsed text through the {@link Reader} interface.
     *
     * @param parser parser instance
     * @param stream binary stream
     * @param metadata document metadata
     * @param context parsing context
     * @param executor executor for the parsing task
     * @throws IOException if the document can not be parsed
     * @since Apache Tika 0.4
     */
    public ParsingReader(
            Parser parser, InputStream stream, Metadata metadata,
            ParseContext context, Executor executor) throws IOException {
        this.parser = parser;

        PipedReader pipedReader = new PipedReader();
        this.reader = new BufferedReader(pipedReader);
        try {
            this.writer = new PipedWriter(pipedReader);
        } catch (IOException e) {
            throw new IllegalStateException(e); // Should never happen
        }

        this.stream = stream;
        this.metadata = metadata;
        this.context = context;

        // All fields must be assigned before the task is handed over,
        // since the task reads them from another thread.
        executor.execute(new ParsingTask());

        // TIKA-203: Buffer first character to force metadata extraction
        reader.mark(1);
        reader.read();
        reader.reset();
    }

    /**
     * Creates a reader for the text content of the given binary stream
     * with the given document metadata, using a new {@link ParseContext}
     * in which the given parser is registered. The context is populated
     * before the background parsing task is started.
     *
     * @param parser parser instance
     * @param stream binary stream
     * @param metadata document metadata
     * @throws IOException if the document can not be parsed
     * @deprecated This method will be removed in Apache Tika 1.0; use
     *     {@link #ParsingReader(Parser, InputStream, Metadata, ParseContext)}
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
     */
    public ParsingReader(Parser parser, InputStream stream, Metadata metadata)
            throws IOException {
        this(parser, stream, metadata, getContext(parser));
    }

    /**
     * Creates a reader for the text content of the given binary stream
     * with the given document metadata, running the parsing task with the
     * given executor and a new {@link ParseContext} in which the given
     * parser is registered. The context is populated before the parsing
     * task is handed to the executor.
     *
     * @param parser parser instance
     * @param stream binary stream
     * @param metadata document metadata
     * @param executor executor for the parsing task
     * @throws IOException if the document can not be parsed
     * @deprecated This method will be removed in Apache Tika 1.0; use
     *     {@link #ParsingReader(Parser, InputStream, Metadata, ParseContext, Executor)}
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
     */
    public ParsingReader(
            Parser parser, InputStream stream, Metadata metadata,
            Executor executor) throws IOException {
        this(parser, stream, metadata, getContext(parser), executor);
    }

    /**
     * The background parsing task.
     */
    private class ParsingTask implements Runnable {

        /**
         * Parses the given binary stream and writes the text content
         * to the write end of the pipe. Potential exceptions (including
         * the one caused if the read end is closed unexpectedly) are
         * stored before the input stream is closed and processing is stopped.
         * Only the first failure is kept; later failures from closing the
         * stream or the pipe do not overwrite it.
         */
        public void run() {
            try {
                ContentHandler handler = new BodyContentHandler(writer);
                parser.parse(stream, handler, metadata, context);
            } catch (Throwable t) {
                throwable = t;
            }

            try {
                stream.close();
            } catch (Throwable t) {
                if (throwable == null) {
                    throwable = t;
                }
            }

            // Closing the write end signals end-of-stream to the reader.
            try {
                writer.close();
            } catch (Throwable t) {
                if (throwable == null) {
                    throwable = t;
                }
            }
        }

    }

    /**
     * Reads parsed text from the pipe connected to the parsing thread.
     * Fails if the parsing thread has thrown an exception: an
     * {@link IOException} is re-thrown as such, any other failure is
     * wrapped as the cause of a new {@link IOException}.
     *
     * @param cbuf character buffer
     * @param off start offset within the buffer
     * @param len maximum number of characters to read
     * @return the value returned by the underlying pipe reader
     * @throws IOException if the parsing thread has failed or
     *                     if for some reason the pipe does not work properly
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        // Read the shared field once so the check and the throw agree.
        Throwable failure = throwable;
        if (failure instanceof IOException) {
            throw (IOException) failure;
        } else if (failure != null) {
            IOException exception = new IOException("");
            exception.initCause(failure);
            throw exception;
        }
        return reader.read(cbuf, off, len);
    }

    /**
     * Closes the read end of the pipe. If the parsing thread is still
     * running, next write to the pipe will fail and cause the thread
     * to stop. Thus there is no need to explicitly terminate the thread.
     *
     * @throws IOException if the pipe can not be closed
     */
    @Override
    public void close() throws IOException {
        reader.close();
    }

}