blob: 615a981af08b90b07d081fb367739ac9b4dd10d9 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.strings;
import static java.nio.charset.StandardCharsets.UTF_8;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.io.IOUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.config.Field;
import org.apache.tika.config.Initializable;
import org.apache.tika.config.InitializableProblemHandler;
import org.apache.tika.config.Param;
import org.apache.tika.detect.FileCommandDetector;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.SystemUtils;
/**
* Parser that uses the "strings" (or strings-alternative) command to find the
* printable strings in an object, or other binary, file
* (application/octet-stream). Useful as a "best-effort" parser for files detected
* as application/octet-stream.
*
* @author gtotaro
*/
public class StringsParser extends AbstractParser implements Initializable {

    /**
     * Serial version UID
     */
    private static final long serialVersionUID = 802566634661575025L;

    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.singleton(MediaType.OCTET_STREAM);

    /**
     * Maximum time, in milliseconds, to wait for the output-reading thread to
     * finish once the "strings" process has exited.
     */
    private static final long GOBBLER_JOIN_MILLIS = 10000L;

    /** Configuration used when the {@link ParseContext} does not supply one. */
    private final StringsConfig defaultStringsConfig = new StringsConfig();

    // Handed to the FileCommandDetector in initialize(); no setter exists in this class.
    private String filePath = "";

    // Created in initialize(); parse() only reaches it when stringsPresent is true.
    private FileCommandDetector fileCommandDetector;

    // Set by checkForStrings(); parse() is a no-op while this is false.
    private boolean stringsPresent = false;

    // Whether or not the strings app allows -e
    private boolean hasEncodingOption = false;

    // Folder prefix prepended to the program name; empty, or ends with File.separator.
    private String stringsPath = "";

    /**
     * Returns the name of the "strings" executable for the current platform.
     *
     * @return {@code "strings.exe"} on Windows, {@code "strings"} otherwise.
     */
    public static String getStringsProg() {
        return SystemUtils.IS_OS_WINDOWS ? "strings.exe" : "strings";
    }

    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Runs the "strings" command over the stream's content, writes its output
     * to the handler as XHTML character data and records the
     * {@code strings:*} metadata entries.
     * <p>
     * Does nothing at all (no metadata, no SAX events) when the "strings"
     * command was not found during {@link #initialize(Map)}.
     *
     * @param stream   the content to parse.
     * @param handler  receives the XHTML events.
     * @param metadata receives the {@code strings:min-len},
     *                 {@code strings:encoding}, {@code strings:file_output} and
     *                 {@code strings:length} entries.
     * @param context  may carry a {@code StringsConfig} overriding the
     *                 parser's defaults.
     * @throws IOException   if any I/O error occurs.
     * @throws SAXException  if the handler rejects the document start/end events.
     * @throws TikaException if the strings process times out or the wait is
     *                       interrupted.
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
                      ParseContext context) throws IOException, SAXException, TikaException {
        if (!stringsPresent) {
            return;
        }
        StringsConfig stringsConfig = context.get(StringsConfig.class, defaultStringsConfig);
        try (TemporaryResources tmp = new TemporaryResources()) {
            TikaInputStream tis = TikaInputStream.get(stream, tmp);
            // The external command needs a real file to read from.
            File input = tis.getFile();

            // Metadata
            metadata.set("strings:min-len", String.valueOf(stringsConfig.getMinLength()));
            // NOTE(review): this stores the whole config's toString(), not
            // getEncoding(); confirm StringsConfig.toString() yields the encoding.
            metadata.set("strings:encoding", stringsConfig.toString());
            metadata.set("strings:file_output", doFile(tis));

            // Content
            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
            xhtml.startDocument();
            int totalBytes = doStrings(input, stringsConfig, xhtml);
            xhtml.endDocument();

            // Metadata
            metadata.set("strings:length", String.valueOf(totalBytes));
        }
    }

    /**
     * Runs the file-command detector on the stream and returns the content
     * type it recorded.
     *
     * @param tis the stream to inspect.
     * @return the {@code Content-Type} value the detector stored in a scratch
     *         {@link Metadata} object, or {@code null} if it stored none.
     * @throws IOException if the detector fails to read the stream.
     */
    private String doFile(TikaInputStream tis) throws IOException {
        Metadata tmpMetadata = new Metadata();
        fileCommandDetector.detect(tis, tmpMetadata);
        return tmpMetadata.get(Metadata.CONTENT_TYPE);
    }

    /**
     * Checks if the "strings" command is supported and, on non-Windows
     * systems, whether it accepts the {@code -e} (encoding) option. The
     * results are stored in {@code stringsPresent} and
     * {@code hasEncodingOption}; nothing is returned.
     */
    private void checkForStrings() {
        String stringsProg = getStringsPath() + getStringsProg();
        String[] checkCmd = {stringsProg, "--version"};
        try {
            stringsPresent = ExternalParser.check(checkCmd);
            if (!stringsPresent) {
                return;
            }
            // Check if the -e option (encoding) is supported
            if (!SystemUtils.IS_OS_WINDOWS) {
                String[] checkOpt =
                        {stringsProg, "-e", "" + defaultStringsConfig.getEncoding().get(),
                                "/dev/null"};
                // Exit status code: 1 = general error; 2 = incorrect usage.
                int[] errorValues = {1, 2};
                hasEncodingOption = ExternalParser.check(checkOpt, errorValues);
            }
        } catch (NoClassDefFoundError ignored) {
            // This happens under OSGi + Fork Parser - see TIKA-1507
            // As a workaround for now, just say we can't use strings
            // TODO Resolve it so we don't need this try/catch block
        }
    }

    /**
     * Runs the "strings" command on the given file and streams its standard
     * output into the handler.
     *
     * @param input  {@link File} object that represents the file to parse.
     * @param config {@link StringsConfig} object including the strings
     *               configuration (minimum length, encoding, timeout).
     * @param xhtml  {@link XHTMLContentHandler} object that receives the output.
     * @return the number of characters decoded (as UTF-8) from the command's
     *         output and passed to the handler. Callers report this value as
     *         {@code strings:length}.
     * @throws IOException   if any I/O error occurs.
     * @throws TikaException if the process does not finish within the
     *                       configured timeout, or the wait is interrupted.
     * @throws SAXException  declared for callers; handler failures raised on
     *                       the reader thread are currently swallowed there.
     */
    private int doStrings(File input, StringsConfig config, XHTMLContentHandler xhtml)
            throws IOException, TikaException, SAXException {
        String stringsProg = getStringsPath() + getStringsProg();

        // Builds the command array: prog -n <min> [-e <enc>] <file>
        ArrayList<String> cmdList = new ArrayList<>(6);
        cmdList.add(stringsProg);
        cmdList.add("-n");
        cmdList.add(String.valueOf(config.getMinLength()));
        // Currently, encoding option is not supported by Windows (and other) versions
        if (hasEncodingOption) {
            cmdList.add("-e");
            cmdList.add("" + config.getEncoding().get());
        }
        cmdList.add(input.getPath());

        ProcessBuilder pb = new ProcessBuilder(cmdList.toArray(new String[0]));
        final Process process = pb.start();
        AtomicInteger totalBytes = new AtomicInteger();

        // Reads content printed out by "strings" command on a separate thread
        // so the process cannot stall on a full stdout pipe while we wait.
        Thread gobbler = logStream(process.getInputStream(), xhtml, totalBytes);
        gobbler.start();
        try {
            if (!process.waitFor(config.getTimeoutSeconds(), TimeUnit.SECONDS)) {
                throw new TikaException("strings process failed",
                        new TimeoutException("timed out"));
            }
            // Give the reader a bounded amount of time to drain remaining output.
            gobbler.join(GOBBLER_JOIN_MILLIS);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers up the stack can observe it.
            Thread.currentThread().interrupt();
            throw new TikaException("strings process failed", e);
        } finally {
            // No-op if the process already exited; otherwise makes sure it is gone.
            process.destroyForcibly();
        }
        return totalBytes.get();
    }

    /**
     * Creates (but does not start) a thread that decodes the stream as UTF-8,
     * forwards every chunk to the handler as character data and adds the
     * number of characters read to the counter.
     *
     * @param stream     the process output to read; closed when the thread ends.
     * @param handler    receives the decoded characters.
     * @param totalBytes accumulator for the number of characters forwarded.
     * @return the not-yet-started reader thread.
     */
    private Thread logStream(final InputStream stream, final ContentHandler handler,
                             final AtomicInteger totalBytes) {
        return new Thread(() -> {
            Reader reader = new InputStreamReader(stream, UTF_8);
            char[] buffer = new char[1024];
            try {
                for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
                    handler.characters(buffer, 0, n);
                    totalBytes.addAndGet(n);
                }
            } catch (SAXException | IOException ignored) {
                // Best effort: a read or handler failure just ends the copy;
                // whatever was forwarded so far is kept.
                // NOTE(review): handler SAXExceptions never reach the caller of
                // parse() this way - confirm that is intended.
            } finally {
                IOUtils.closeQuietly(stream);
            }
        });
    }

    /**
     * Returns the "strings" installation folder.
     *
     * @return the folder prefix, either empty or ending with the platform's
     *         file separator.
     */
    public String getStringsPath() {
        return stringsPath;
    }

    /**
     * Sets the "strings" installation folder. A trailing file separator is
     * appended when missing; {@code null} is treated like the empty string
     * (no folder, i.e. the program is looked up by name only).
     *
     * @param path the "strings" installation folder.
     */
    @Field
    public void setStringsPath(String path) {
        String normalized = (path == null) ? "" : path;
        if (!normalized.isEmpty() && !normalized.endsWith(File.separator)) {
            normalized += File.separator;
        }
        this.stringsPath = normalized;
    }

    /**
     * Sets the default encoding from the name of a {@link StringsEncoding}
     * constant.
     *
     * @param encoding the enum constant name.
     * @throws IllegalArgumentException if no constant has that name.
     */
    @Field
    public void setEncoding(String encoding) {
        defaultStringsConfig.setEncoding(StringsEncoding.valueOf(encoding));
    }

    /**
     * @return the default minimum sequence length passed to "strings" via
     *         {@code -n}.
     */
    public int getMinLength() {
        return defaultStringsConfig.getMinLength();
    }

    /**
     * @param minLength the default minimum sequence length passed to "strings"
     *                  via {@code -n}.
     */
    @Field
    public void setMinLength(int minLength) {
        defaultStringsConfig.setMinLength(minLength);
    }

    /**
     * @return the default number of seconds to wait for the "strings" process.
     */
    public int getTimeoutSeconds() {
        return defaultStringsConfig.getTimeoutSeconds();
    }

    /**
     * @param timeoutSeconds the default number of seconds to wait for the
     *                       "strings" process.
     */
    @Field
    public void setTimeoutSeconds(int timeoutSeconds) {
        defaultStringsConfig.setTimeoutSeconds(timeoutSeconds);
    }

    /**
     * @return the default encoding from the parser's own configuration.
     */
    public StringsEncoding getStringsEncoding() {
        return defaultStringsConfig.getEncoding();
    }

    /**
     * Probes for the "strings" command and sets up the file-command detector,
     * giving it the same timeout as the strings process (converted to
     * milliseconds).
     *
     * @param params initialization parameters; not used by this parser.
     * @throws TikaConfigException declared by the interface; not thrown here.
     */
    @Override
    public void initialize(Map<String, Param> params) throws TikaConfigException {
        checkForStrings();
        fileCommandDetector = new FileCommandDetector();
        fileCommandDetector.setFilePath(filePath);
        fileCommandDetector.setTimeoutMs(defaultStringsConfig.getTimeoutSeconds() * 1000);
    }

    /**
     * Intentionally empty: a missing "strings" command is not reported as a
     * problem; {@link #parse} simply does nothing in that case.
     */
    @Override
    public void checkInitialization(InitializableProblemHandler problemHandler)
            throws TikaConfigException {
    }
}