blob: 44f7b1d6f69d48e8879bee276558468d81f9a1e8 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one *
* or more contributor license agreements. See the NOTICE file *
* distributed with this work for additional information *
* regarding copyright ownership. The ASF licenses this file *
* to you under the Apache License, Version 2.0 (the *
* "License"); you may not use this file except in compliance *
* with the License. You may obtain a copy of the License at *
* *
* http://www.apache.org/licenses/LICENSE-2.0 *
* *
* Unless required by applicable law or agreed to in writing, *
* software distributed under the License is distributed on an *
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
* KIND, either express or implied. See the License for the *
* specific language governing permissions and limitations *
* under the License. *
*/
package org.apache.rat.document.impl.guesser;
import org.apache.commons.io.IOUtils;
import org.apache.rat.api.Document;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Locale;
/**
 * Heuristics for deciding whether a document is binary, based either on its
 * file name (well-known extensions) or on a small sample ("taste") of its
 * content.
 *
 * <p>TODO: factor into MIME guesser and {@code MIME->binary} guesser</p>
 */
public class BinaryGuesser {

    private static final String DOT = ".";

    /** Name of the system property consulted for the charset used to decode content samples. */
    static final String FILE_ENCODING = "file.encoding";

    /** File name that {@link #isBinary(String)} always reports as binary (compared ignoring case). */
    public static final String JAR_MANIFEST = "MANIFEST.MF";

    /** Exact name that {@link #isExecutable(String)} treats as an executable. */
    public static final String JAVA = "JAVA";

    /** Weight applied to the count of suspicious characters in the content heuristic. */
    public static final int HIGH_BYTES_RATIO = 100;

    /** Weight applied to the sample length in the content heuristic. */
    public static final int TOTAL_READ_RATIO = 30;

    /** Characters strictly above this value count as suspicious. */
    public static final int NON_ASCII_THREASHOLD = 256;

    /** Characters at or below this value count as suspicious. */
    public static final int ASCII_CHAR_THREASHOLD = 8;

    /** Maximum number of bytes sampled from an {@link InputStream}. */
    private static final int BYTE_TASTE_SIZE = 200;

    /** Maximum number of characters sampled from a {@link Reader}. */
    private static final int CHAR_TASTE_SIZE = 100;

    private static final String[] DATA_EXTENSIONS = {
        "DAT", "DOC",
        "NCB", "IDB",
        "SUO", "XCF",
        "RAJ", "CERT",
        "KS", "TS",
        "ODP", "SWF"
    };

    private static final String[] EXE_EXTENSIONS = {
        "EXE", "DLL",
        "LIB", "SO",
        "A", "EXP",
    };

    private static final String[] KEYSTORE_EXTENSIONS = {
        "JKS", "KEYSTORE", "PEM", "CRL", "TRUSTSTORE"
    };

    private static final String[] IMAGE_EXTENSIONS = {
        "PNG", "PDF",
        "GIF", "GIFF",
        "TIF", "TIFF",
        "JPG", "JPEG",
        "ICO", "ICNS",
        "PSD",
    };

    private static final String[] BYTECODE_EXTENSIONS = {
        "CLASS", "PYD",
        "OBJ", "PYC",
    };

    /**
     * Based on http://www.apache.org/dev/svn-eol-style.txt
     */
    private static final String[] NON_BINARY_EXTENSIONS = {
        "AART",
        "AC",
        "AM",
        "BAT",
        "C",
        "CAT",
        "CGI",
        "CLASSPATH",
        "CMD",
        "CONFIG",
        "CPP",
        "CSS",
        "CWIKI",
        "DATA",
        "DCL",
        "DTD",
        "EGRM",
        "ENT",
        "FT",
        "FN",
        "FV",
        "GRM",
        "G",
        "H",
        "HTACCESS",
        "HTML",
        "IHTML",
        "IN",
        "JAVA",
        "JMX",
        "JSP",
        "JS",
        "JUNIT",
        "JX",
        "MANIFEST",
        "M4",
        "MF",
        "META",
        "MOD",
        "N3",
        "PEN",
        "PL",
        "PM",
        "POD",
        "POM",
        "PROJECT",
        "PROPERTIES",
        "PY",
        "RB",
        "RDF",
        "RNC",
        "RNG",
        "RNX",
        "ROLES",
        "RSS",
        "SH",
        "SQL",
        "SVG",
        "TLD",
        "TXT",
        "TYPES",
        "VM",
        "VSL",
        "WSDD",
        "WSDL",
        "XARGS",
        "XCAT",
        "XCONF",
        "XEGRM",
        "XGRM",
        "XLEX",
        "XLOG",
        "XMAP",
        "XML",
        "XROLES",
        "XSAMPLES",
        "XSD",
        "XSL",
        "XSLT",
        "XSP",
        "XUL",
        "XWEB",
        "XWELCOME",
    };

    /** Charset used to decode byte samples; resolved once when the class is loaded. */
    private static final Charset CHARSET_FROM_FILE_ENCODING_OR_UTF8 = getFileEncodingOrUTF8AsFallback();

    /**
     * Samples the content of the document and applies the content heuristic.
     *
     * @param document the document whose content is sampled.
     * @return {@code true} if the sample looks binary; {@code false} otherwise
     * or if the content could not be opened.
     */
    private static boolean isBinaryDocument(Document document) {
        InputStream stream = null;
        try {
            stream = document.inputStream();
            return isBinary(stream);
        } catch (IOException e) {
            // Best effort: an unreadable document is not reported as binary.
            return false;
        } finally {
            IOUtils.closeQuietly(stream);
        }
    }

    /**
     * Core content heuristic: counts characters that are either above
     * {@link #NON_ASCII_THREASHOLD} or at/below {@link #ASCII_CHAR_THREASHOLD}
     * and reports binary when
     * {@code count * HIGH_BYTES_RATIO} exceeds {@code length * TOTAL_READ_RATIO}.
     *
     * @param taste the decoded sample to inspect.
     * @return {@code true} if the proportion of suspicious characters is too high.
     */
    private static boolean isBinary(CharSequence taste) {
        final int length = taste.length();
        int highBytes = 0;
        for (int i = 0; i < length; i++) {
            final char c = taste.charAt(i);
            if (c > NON_ASCII_THREASHOLD || c <= ASCII_CHAR_THREASHOLD) {
                highBytes++;
            }
        }
        return highBytes * HIGH_BYTES_RATIO > length * TOTAL_READ_RATIO;
    }

    /**
     * @param in the file to check.
     * @return Do the first few bytes of the stream hint at a binary file?
     * <p>Any IOException is swallowed internally and the test returns
     * false.</p>
     * <p>This method may lead to false negatives if the reader throws
     * an exception because it can't read characters according to the
     * reader's encoding from the underlying stream.</p>
     */
    public static boolean isBinary(Reader in) {
        char[] taste = new char[CHAR_TASTE_SIZE];
        try {
            int charsRead = in.read(taste);
            if (charsRead > 0) {
                return isBinary(new String(taste, 0, charsRead));
            }
        } catch (IOException e) {
            // SWALLOW: documented best-effort behaviour, the result is false.
        }
        return false;
    }

    /**
     * @param in the file to check.
     * @return Do the first few bytes of the stream hint at a binary file?
     * <p>Any IOException is swallowed internally and the test returns
     * false.</p>
     * <p>This method will try to read bytes from the stream and
     * translate them to characters according to the charset named by
     * the {@code file.encoding} system property (UTF-8 if that is
     * missing or unusable). If any bytes can not be translated to
     * characters it will assume the original data must be binary and
     * return true.</p>
     * <p>When the sample buffer is filled completely the stream may
     * contain more data, so a multi-byte character cut off at the very
     * end of the sample is ignored instead of being treated as
     * malformed input.</p>
     */
    public static boolean isBinary(InputStream in) {
        try {
            byte[] taste = new byte[BYTE_TASTE_SIZE];
            int bytesRead = readTaste(in, taste);
            if (bytesRead > 0) {
                // Only when the stream ended inside the sample do we know that
                // the last byte read really is the end of the input.
                final boolean endOfInput = bytesRead < taste.length;
                ByteBuffer bytes = ByteBuffer.wrap(taste, 0, bytesRead);
                CharBuffer chars = CharBuffer.allocate(2 * bytesRead);
                CharsetDecoder cd = CHARSET_FROM_FILE_ENCODING_OR_UTF8.newDecoder()
                    .onMalformedInput(CodingErrorAction.REPORT)
                    .onUnmappableCharacter(CodingErrorAction.REPORT);
                while (true) {
                    CoderResult res = cd.decode(bytes, chars, endOfInput);
                    if (res.isError()) {
                        // Malformed or unmappable bytes: not text in this charset.
                        return true;
                    }
                    if (res.isUnderflow()) {
                        // Everything decodable has been consumed.
                        break;
                    }
                    // Overflow: move what has been decoded so far into a larger buffer.
                    chars.flip();
                    CharBuffer on = CharBuffer.allocate(chars.capacity() * 2);
                    on.put(chars);
                    chars = on;
                }
                chars.flip();
                return isBinary(chars);
            }
        } catch (IOException e) {
            // SWALLOW: documented best-effort behaviour, the result is false.
        }
        return false;
    }

    /**
     * Fills the buffer from the stream, repeating short reads until the
     * buffer is full or the end of the stream is reached.
     *
     * @param in the stream to read from.
     * @param taste the buffer to fill.
     * @return the number of bytes stored in the buffer, possibly zero.
     * @throws IOException if reading from the stream fails.
     */
    private static int readTaste(InputStream in, byte[] taste) throws IOException {
        int total = 0;
        while (total < taste.length) {
            int read = in.read(taste, total, taste.length - total);
            if (read < 0) {
                break;
            }
            total += read;
        }
        return total;
    }

    /**
     * Resolves the charset named by the {@code file.encoding} system property.
     *
     * @return the charset named by the property, or UTF-8 if the property is
     * not set, names an illegal charset, or names a charset that is not
     * supported (see {@link UnsupportedCharsetException}).
     */
    static Charset getFileEncodingOrUTF8AsFallback() {
        final String encoding = System.getProperty(FILE_ENCODING);
        if (encoding != null) {
            try {
                return Charset.forName(encoding);
            } catch (IllegalArgumentException e) {
                // Common supertype of IllegalCharsetNameException and
                // UnsupportedCharsetException: fall through to UTF-8.
            }
        }
        return Charset.forName("UTF-8");
    }

    /**
     * @param name current file name.
     * @return whether given name is binary.
     */
    public static final boolean isBinaryData(final String name) {
        return extensionMatches(name, DATA_EXTENSIONS);
    }

    /**
     * @param name current file name.
     * @return Is a file by that name a known non-binary file?
     */
    public static final boolean isNonBinary(final String name) {
        if (name == null) {
            return false;
        }
        return extensionMatches(name.toUpperCase(Locale.US), NON_BINARY_EXTENSIONS);
    }

    /**
     * @param name current file name.
     * @return Is a file by that name an executable/binary file?
     * Returns false for a {@code null} name.
     */
    public static final boolean isExecutable(final String name) {
        if (name == null) {
            return false;
        }
        return name.equals(JAVA)
            || extensionMatches(name, EXE_EXTENSIONS)
            || containsExtension(name, EXE_EXTENSIONS);
    }

    /**
     * @param name file name to inspect; the comparison is case-sensitive.
     * @param exts extensions to look for, without the leading dot.
     * @return whether the name contains one of the extensions enclosed in
     * dots, e.g. {@code ".SO."}. Returns false for a {@code null} name.
     */
    public static boolean containsExtension(final String name,
            final String[] exts) {
        if (name == null) {
            return false;
        }
        for (int i = 0; i < exts.length; i++) {
            if (name.contains(DOT + exts[i] + DOT)) {
                return true;
            }
        }
        return false;
    }

    /**
     * @param name file name to inspect; the comparison is case-sensitive.
     * @param exts extensions to look for, without the leading dot.
     * @return whether the name ends with a dot followed by one of the
     * extensions. Returns false for a {@code null} name.
     */
    public static boolean extensionMatches(final String name,
            final String[] exts) {
        if (name == null) {
            return false;
        }
        for (int i = 0; i < exts.length; i++) {
            if (name.endsWith(DOT + exts[i])) {
                return true;
            }
        }
        return false;
    }

    /**
     * @param name file name.
     * @return whether the name ends with a known bytecode/object-code extension.
     */
    public static boolean isBytecode(final String name) {
        return extensionMatches(name, BYTECODE_EXTENSIONS);
    }

    /**
     * @param name file name.
     * @return whether the name ends with a known image extension.
     */
    public static final boolean isImage(final String name) {
        return extensionMatches(name, IMAGE_EXTENSIONS);
    }

    /**
     * @param name file name.
     * @return whether the name ends with a known keystore extension.
     */
    public static final boolean isKeystore(final String name) {
        return extensionMatches(name, KEYSTORE_EXTENSIONS);
    }

    /**
     * @param name file name.
     * @return Is a file by that name a known binary file?
     */
    public static final boolean isBinary(final String name) {
        if (name == null) {
            return false;
        }
        // NOTE(review): the extension tables are upper case and matched
        // case-sensitively, so normalise presumably upper-cases the name - confirm.
        final String normalisedName = GuessUtils.normalise(name);
        return JAR_MANIFEST.equalsIgnoreCase(name)
            || isImage(normalisedName)
            || isKeystore(normalisedName)
            || isBytecode(normalisedName)
            || isBinaryData(normalisedName)
            || isExecutable(normalisedName);
    }

    /**
     * @param document the document to check.
     * @return whether the document is binary, judged first by its name and,
     * if the name is not conclusive, by a sample of its content.
     */
    public static final boolean isBinary(final Document document) {
        // TODO: reimplement the binary test algorithm?
        // TODO: more efficient to move into standard analysis
        // TODO: then use binary as default
        return isBinary(document.getName())
            ||
            // try a taste
            isBinaryDocument(document);
    }
}