blob: 19352cfc017a711794971a4f37131941f7989cba [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.digestutils;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import org.apache.commons.codec.binary.Base32;
import org.apache.commons.codec.binary.Hex;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.DigestingParser;
import org.apache.tika.parser.digest.CompositeDigester;
import org.apache.tika.parser.digest.InputStreamDigester;
/**
 * Implementation of {@link org.apache.tika.parser.DigestingParser.Digester}
 * that computes one or more digests of a stream and encodes each of them with
 * Apache Commons Codec, either as hexadecimal or as base 32.
 * <p>
 * This class only parses the requested algorithms/encodings and creates one
 * {@link InputStreamDigester} per algorithm; the resulting digesters are handed
 * to {@link CompositeDigester}.
 * <p>
 * This digester tries to use the regular mark/reset protocol on the InputStream.
 * However, this wraps an internal BoundedInputStream, and if the InputStream
 * is not fully read, then this will reset the stream and
 * spool the InputStream to disk (via TikaInputStream) and then digest the file.
 */
public class CommonsDigester extends CompositeDigester {

    /**
     * Include a string representing the comma-separated algorithms to run: e.g. "md5,sha1".
     * If you want base 32 encoding instead of hexadecimal, add ":32" to the algorithm, e.g.
     * "md5,sha1:32". ":16" explicitly selects hexadecimal, which is also the default.
     * <p>
     * Algorithm names are matched case-insensitively against {@link DigestAlgorithm};
     * whitespace around an algorithm name or an encoding is ignored.
     *
     * @param markLimit       limit for mark/reset; after this limit is hit, the
     *                        stream is reset and spooled to disk
     * @param algorithmString comma-separated list of {@code algorithm[:encoding]} entries
     * @throws IllegalArgumentException if an algorithm isn't supported or an encoding
     *                                  is neither "16" nor "32"
     * @throws NullPointerException     if {@code algorithmString} is {@code null}
     */
    public CommonsDigester(int markLimit, String algorithmString) {
        super(buildDigesters(markLimit, algorithmString));
    }

    /**
     * Creates a digester that runs the given algorithms; every digest is
     * hexadecimal-encoded.
     *
     * @param markLimit  limit for mark/reset; after this limit is hit, the
     *                   stream is reset and spooled to disk
     * @param algorithms algorithms to run
     * @deprecated use {@link #CommonsDigester(int, String)}
     */
    @Deprecated
    public CommonsDigester(int markLimit, DigestAlgorithm... algorithms) {
        super(buildDigesters(markLimit, algorithms));
    }

    /**
     * Builds one hexadecimal-encoding digester per algorithm, in the given order.
     *
     * @param markLimit  mark limit passed through to every digester
     * @param algorithms algorithms to run
     * @return one digester per entry of {@code algorithms}
     */
    private static DigestingParser.Digester[] buildDigesters(int markLimit,
                                                             DigestAlgorithm[] algorithms) {
        DigestingParser.Digester[] digesters = new DigestingParser.Digester[algorithms.length];
        for (int i = 0; i < algorithms.length; i++) {
            digesters[i] = newDigester(markLimit, algorithms[i], new HexEncoder());
        }
        return digesters;
    }

    /**
     * This returns digest algorithms only. It does not understand the encoding
     * syntax, e.g. "MD5:32" (base 32 encoding of MD5). To parse
     * those, see {@link #CommonsDigester(int, String)}.
     *
     * @param s comma-delimited list of algorithms to use: md5,sha256. Names are
     *          matched case-insensitively; whitespace around a name is ignored.
     * @return the matching algorithms, in the order they appear in {@code s}
     * @throws IllegalArgumentException if an entry doesn't match a supported algorithm
     * @throws NullPointerException     if {@code s} is {@code null}
     * @deprecated use the {@link #CommonsDigester(int, String)} instead
     */
    @Deprecated
    public static DigestAlgorithm[] parse(String s) {
        // Explicit check instead of an assert: asserts are disabled by default.
        if (s == null) {
            throw new NullPointerException("algorithm list must not be null");
        }
        List<DigestAlgorithm> algorithms = new ArrayList<>();
        for (String token : s.split(",")) {
            algorithms.add(getDigestAlgorithm(token));
        }
        return algorithms.toArray(new DigestAlgorithm[0]);
    }

    /**
     * Looks up the {@link DigestAlgorithm} whose constant name equals the given
     * string, ignoring case and surrounding whitespace.
     *
     * @param algoString algorithm name, e.g. "md5" or "SHA256"
     * @return the matching algorithm
     * @throws IllegalArgumentException if no algorithm matches; the message lists
     *                                  all supported names
     */
    private static DigestAlgorithm getDigestAlgorithm(String algoString) {
        String wanted = algoString.trim().toUpperCase(Locale.ROOT);
        // Iterate over the enum itself so the lookup can never fall out of sync
        // with the declared constants.
        for (DigestAlgorithm candidate : DigestAlgorithm.values()) {
            if (candidate.name().equals(wanted)) {
                return candidate;
            }
        }
        StringBuilder supported = new StringBuilder();
        for (DigestAlgorithm candidate : DigestAlgorithm.values()) {
            if (supported.length() > 0) {
                supported.append(", ");
            }
            supported.append(candidate.name());
        }
        throw new IllegalArgumentException(
                "Couldn't match " + algoString + " with any of: " + supported);
    }

    /**
     * Parses a definition such as "md5,sha1:32" and builds one digester per entry.
     *
     * @param markLimit   mark limit passed through to every digester
     * @param digesterDef comma-separated list of {@code algorithm[:encoding]} entries
     * @return one digester per entry, in definition order
     * @throws IllegalArgumentException if an algorithm or encoding isn't supported
     * @throws NullPointerException     if {@code digesterDef} is {@code null}
     */
    private static DigestingParser.Digester[] buildDigesters(int markLimit, String digesterDef) {
        if (digesterDef == null) {
            throw new NullPointerException("digester definition must not be null");
        }
        String[] specs = digesterDef.split(",");
        DigestingParser.Digester[] digesters = new DigestingParser.Digester[specs.length];
        for (int i = 0; i < specs.length; i++) {
            String spec = specs[i];
            String[] parts = spec.split(":");
            // The encoding is validated before the algorithm name.
            DigestingParser.Encoder encoder =
                    parts.length > 1 ? encoderFor(parts[1]) : new HexEncoder();
            // String.split drops trailing empty strings, so an entry consisting only
            // of colons yields an empty array; use the raw entry so the lookup
            // fails with an IllegalArgumentException rather than an index error.
            String algorithmName = parts.length == 0 ? spec : parts[0];
            digesters[i] = newDigester(markLimit, getDigestAlgorithm(algorithmName), encoder);
        }
        return digesters;
    }

    /**
     * Maps the encoding suffix of an entry to an encoder: "16" selects
     * hexadecimal, "32" selects base 32.
     *
     * @param radix text following the ':' in an entry; surrounding whitespace is ignored
     * @return the encoder for the requested encoding
     * @throws IllegalArgumentException if {@code radix} is neither "16" nor "32"
     */
    private static DigestingParser.Encoder encoderFor(String radix) {
        String trimmed = radix.trim();
        if (trimmed.equals("16")) {
            return new HexEncoder();
        }
        if (trimmed.equals("32")) {
            return new Base32Encoder();
        }
        throw new IllegalArgumentException(
                "Value must be '16' or '32', but was '" + radix + "'");
    }

    /**
     * Creates the digester for a single algorithm/encoder pair.
     */
    private static DigestingParser.Digester newDigester(int markLimit, DigestAlgorithm algorithm,
                                                        DigestingParser.Encoder encoder) {
        return new InputStreamDigester(markLimit, algorithm.getJavaName(), algorithm.name(),
                encoder);
    }

    /**
     * Digest algorithms supported by this digester. Each constant carries the
     * algorithm name that is passed to {@link InputStreamDigester}.
     */
    public enum DigestAlgorithm {
        //those currently available in commons.digest
        MD2("MD2"), MD5("MD5"), SHA1("SHA-1"), SHA256("SHA-256"), SHA384("SHA-384"),
        SHA512("SHA-512");

        private final String javaName;

        DigestAlgorithm(String javaName) {
            this.javaName = javaName;
        }

        /**
         * @return the algorithm name handed to {@link InputStreamDigester}, e.g. "SHA-256"
         */
        String getJavaName() {
            return javaName;
        }

        /**
         * @return the Tika metadata prefix, followed by "digest", the namespace
         * delimiter and this constant's name
         */
        String getMetadataKey() {
            return TikaCoreProperties.TIKA_META_PREFIX + "digest" +
                    TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + this.toString();
        }
    }

    /**
     * Encodes digest bytes as a hexadecimal string via Commons Codec {@link Hex}.
     */
    private static class HexEncoder implements DigestingParser.Encoder {
        @Override
        public String encode(byte[] bytes) {
            return Hex.encodeHexString(bytes);
        }
    }

    /**
     * Encodes digest bytes as a base 32 string via Commons Codec {@link Base32}.
     */
    private static class Base32Encoder implements DigestingParser.Encoder {
        @Override
        public String encode(byte[] bytes) {
            return new Base32().encodeToString(bytes);
        }
    }
}