/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.jackrabbit.vault.fs.impl.io;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.zip.Deflater;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import java.util.zip.ZipOutputStream;
import javax.annotation.Nonnull;
import javax.jcr.RepositoryException;
import org.apache.commons.io.IOUtils;
import org.apache.jackrabbit.vault.fs.api.Artifact;
import org.apache.jackrabbit.vault.fs.api.SerializationType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * {@code CompressionUtil} is a utility class for evaluating the
 * compressibility of artifacts.
*/
public final class CompressionUtil {
/**
* default logger
*/
private static final Logger log = LoggerFactory.getLogger(CompressionUtil.class);
/**
 * {@code true} if the environment supports changing the compression level for individual jar entries, {@code false} otherwise.
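 * <p>
 * For illustration only ({@code zipOut} and {@code artifact} are hypothetical
 * names, not part of this class), per-entry level switching would be gated on
 * this flag:
 * <pre>{@code
 * if (CompressionUtil.ENV_SUPPORTS_COMPRESSION_LEVEL_CHANGE
 *         && CompressionUtil.isCompressible(artifact) < 0) {
 *     zipOut.setLevel(Deflater.NO_COMPRESSION); // don't recompress compressed data
 * }
 * }</pre>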
*/
public static final boolean ENV_SUPPORTS_COMPRESSION_LEVEL_CHANGE = checkEnvironmentSupportsCompressionSwitch();
/**
 * Minimum artifact length (in bytes) required to run the auto-detection algorithm.
*/
private static final long MIN_AUTO_DETECTION_LENGTH = 115 * 1024;
/**
 * Length of the sample (in bytes) peeked from the artifact for running the auto-detection algorithm.
*/
private static final int SAMPLE_LENGTH = 256;
// TODO extend the MIME type lists
/**
 * Set of well known MIME types identifying compressed formats.
*/
private static final Set<String> INCOMPRESSIBLE_MIME_TYPES = new HashSet<String>(Arrays.asList(
"image/gif",
"image/jpeg",
"image/png",
"multipart/x-gzip",
"video/mp4",
"application/gzip",
"application/java-archive",
"application/mp4",
"application/x-7z-compressed",
"application/x-compressed",
"application/x-gzip",
"application/x-rar-compressed",
"application/zip",
"application/zlib",
"audio/mpeg"
));
/**
 * Set of well known MIME types identifying non-compressed, compressible formats.
*/
private static final Set<String> COMPRESSIBLE_MIME_TYPES = new HashSet<String>(Arrays.asList(
"application/xml",
"application/java",
"application/json",
"application/javascript",
"application/ecmascript"
));
/**
 * Estimates whether the provided artifact is compressible.
*
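 * <p>
 * A hypothetical caller ({@code zipOut} and {@code artifact} are illustrative
 * names, not part of this class) might map the result to a deflate level:
 * <pre>{@code
 * int c = CompressionUtil.isCompressible(artifact);
 * if (c > 0) {        // estimated compressible: deflate normally
 *     zipOut.setLevel(Deflater.DEFAULT_COMPRESSION);
 * } else if (c < 0) { // estimated already compressed: skip the effort
 *     zipOut.setLevel(Deflater.NO_COMPRESSION);
 * }                   // c == 0: no estimate was made, keep the current level
 * }</pre>
 *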
* @param artifact the artifact to be tested for compressibility
 * @return a positive integer if the artifact is estimated to be compressible,
 *         a negative integer if it is estimated to be incompressible, or zero
 *         if the estimate did not run.
*/
public static int isCompressible(@Nonnull Artifact artifact) {
if (SerializationType.GENERIC == artifact.getSerializationType()) {
/*
* Test for known content types
*/
String contentType = artifact.getContentType();
if (contentType != null) {
contentType = contentType.toLowerCase();
if (isCompressibleContentType(contentType)) {
return 1;
}
if (isIncompressibleContentType(contentType)) {
return -1;
}
}
            /*
             * Apply the compressibility prediction heuristic on a sample of the artifact.
             *
             * The heuristic is run only if its expected cost is smaller than 3% of the
             * expected cost of compressing the artifact, so that the extra cost remains
             * reasonable even in the worst case.
             *
             * The compression throughput is assumed to be 80 MB/s or less.
             * The cost of peeking a sample and running the heuristic is expected to be
             * 150 microseconds or less.
             *
             * The artifact size threshold is thus set to 115 KB.
             *
             * An improved implementation could measure those values and self-tune for a
             * specific runtime.
             */
long contentLength = artifact.getContentLength();
if (contentLength > MIN_AUTO_DETECTION_LENGTH) {
return seemsCompressible(artifact);
}
}
return 0;
}
static boolean isCompressibleContentType(@Nonnull String mimeType) {
return mimeType.startsWith("text/") || COMPRESSIBLE_MIME_TYPES.contains(mimeType);
}
static boolean isIncompressibleContentType(@Nonnull String mimeType) {
return INCOMPRESSIBLE_MIME_TYPES.contains(mimeType);
}
static int seemsCompressible(@Nonnull Artifact artifact) {
try (InputStream stream = artifact.getInputStream()) {
byte[] sample = IOUtils.toByteArray(stream, SAMPLE_LENGTH);
return isCompressible(sample, SAMPLE_LENGTH) ? 1 : -1;
} catch (RepositoryException | IOException e) {
log.warn(e.getMessage(), e);
}
return 0;
}
/**
* This algorithm estimates the entropy of the high nibbles.
* <p>
 * Credits to Thomas Mueller for this solution, shared on Stack Overflow at
* <a href="http://bit.ly/2mKsp0v">How To Efficiently Predict If Data Is Compressible</a>
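 * <p>
 * Intuition: for each high-nibble count {@code x}, {@code v = x * 2^32 / len}
 * is the nibble's relative frequency {@code p = x / len} in 32-bit fixed point,
 * and {@code 63 - numberOfLeadingZeros(v + 1)} equals {@code floor(log2(v + 1))},
 * i.e. roughly {@code 32 + log2(p)}. Uniformly random data yields 16 terms of
 * about 28 each, so {@code r} lands near 448, above the threshold of 438
 * (incompressible), while a constant byte yields a single term of 32
 * (compressible).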
*
* @param data the data to be tested for compressibility
* @param len the length of the data to be tested
* @return {@code true} if the data sample is compressible,
* {@code false} otherwise.
*/
private static boolean isCompressible(byte[] data, int len) {
// the number of bytes with
// high nibble 0, 1,.., 15
int[] sum = new int[16];
for (int i = 0; i < len; i++) {
int x = (data[i] & 255) >> 4;
sum[x]++;
}
        // sum an integer approximation of the entropy:
        // 63 - numberOfLeadingZeros(v + 1) == floor(log2(v + 1)), roughly 32 + log2(x / len)
        int r = 0;
        for (int x : sum) {
            long v = ((long) x << 32) / len;
            r += 63 - Long.numberOfLeadingZeros(v + 1);
        }
        // 438 is just below the ~448 scored by uniformly random data
        return r < 438;
}
/**
 * Checks whether writing a {@code JarOutputStream} supports switching the compression
 * level for individual jar entries. There are known issues with recent zlib versions
 * and Java that might result in broken packages being exported when they contain
 * binary entries that CompressionUtil considers already compressed.
 * <p>
 * For more information see:
* https://issues.apache.org/jira/browse/JCRVLT-257
* https://github.com/madler/zlib/issues/305
*
* @return {@code true} if the environment supports switching compression levels
*/
private static boolean checkEnvironmentSupportsCompressionSwitch() {
Throwable exception = null;
ByteArrayOutputStream byteArrayOut = new ByteArrayOutputStream();
byte[] nullBytes = new byte[1024 * 1024];
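        // probe: write a small zip, switching the deflate level between two entries;
        // on affected zlib/Java combinations this can produce a corrupt stream (JCRVLT-257)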
try (ZipOutputStream zipOut = new ZipOutputStream(byteArrayOut)) {
zipOut.setLevel(Deflater.BEST_SPEED);
zipOut.putNextEntry(new ZipEntry("deflated.bin"));
zipOut.write(nullBytes);
zipOut.closeEntry();
zipOut.putNextEntry(new ZipEntry("stored.bin"));
zipOut.setLevel(Deflater.BEST_COMPRESSION);
zipOut.write(nullBytes);
zipOut.closeEntry();
} catch (Throwable e) {
exception = e;
}
if (exception == null) {
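            // read the zip back; if the level switch corrupted the stream,
            // inflating the entries fails here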
try (ZipInputStream zipIn = new ZipInputStream(new ByteArrayInputStream(byteArrayOut.toByteArray()))) {
for (int i = 0; i < 2; i++) {
zipIn.getNextEntry();
                // drain the entry so its data is actually inflated and verified
                while (zipIn.read(nullBytes) >= 0) {
                    // nop
                }
}
} catch (Throwable e) {
exception = e;
}
}
if (exception != null) {
log.info("The current environment doesn't support switching compression level for individual JarEntries, see JCRVLT-257");
return false;
}
return true;
}
}