lucene/core/src/java/org/apache/lucene/util/compress/LowercaseAsciiCompression.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.util.compress;

 import java.io.IOException;

 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.util.BytesRef;

 /**
  * Utility class that can efficiently compress arrays that mostly contain
  * characters in the [0x1F,0x3F) or [0x5F,0x7F) ranges, which notably
  * include all digits, lowercase characters, '.', '-' and '_'.
  */
 public final class LowercaseAsciiCompression {

   private static final boolean isCompressible(int b) {
     final int high3Bits = (b + 1) & ~0x1F;
     return high3Bits == 0x20 || high3Bits == 0x60;
   }

   private LowercaseAsciiCompression() {}

   /**
    * Compress {@code in[0:len]} into {@code out}.
    * This returns {@code false} if the content cannot be compressed. The number
    * of bytes written is guaranteed to be less than {@code len} otherwise.
    */
   public static boolean compress(byte[] in, int len, byte[] tmp, DataOutput out) throws IOException {
     if (len < 8) {
       return false;
     }

     // 1. Count exceptions and fail compression if there are too many of them.
     final int maxExceptions = len >>> 5;
     int previousExceptionIndex = 0;
     int numExceptions = 0;
     for (int i = 0; i < len; ++i) {
       final int b = in[i] & 0xFF;
       if (isCompressible(b) == false) {
         while (i - previousExceptionIndex > 0xFF) {
           ++numExceptions;
           previousExceptionIndex += 0xFF;
         }
         if (++numExceptions > maxExceptions) {
           return false;
         }
         previousExceptionIndex = i;
       }
     }
     assert numExceptions <= maxExceptions;

     // 2. Now move all bytes to the [0,0x40) range (6 bits). This loop gets auto-vectorized on JDK13+.
     final int compressedLen = len - (len >>> 2); // ignores exceptions
     assert compressedLen < len;
     for (int i = 0; i < len; ++i) {
       int b = (in[i] & 0xFF) + 1;
       tmp[i] = (byte) ((b & 0x1F) | ((b & 0x40) >>> 1));
     }

     // 3. Now pack the bytes so that we record 4 ASCII chars in 3 bytes
     int o = 0;
     for (int i = compressedLen; i < len; ++i) {
       tmp[o++] |= (tmp[i] & 0x30) << 2; // bits 4-5
     }
     for (int i = compressedLen; i < len; ++i) {
       tmp[o++] |= (tmp[i] & 0x0C) << 4; // bits 2-3
     }
     for (int i = compressedLen; i < len; ++i) {
       tmp[o++] |= (tmp[i] & 0x03) << 6; // bits 0-1
     }
     assert o <= compressedLen;

     out.writeBytes(tmp, 0, compressedLen);

     // 4. Finally record exceptions
     out.writeVInt(numExceptions);
     if (numExceptions > 0) {
       previousExceptionIndex = 0;
       int numExceptions2 = 0;
       for (int i = 0; i < len; ++i) {
         int b = in[i] & 0xFF;
         if (isCompressible(b) == false) {
           while (i - previousExceptionIndex > 0xFF) {
             // We record deltas between exceptions as bytes, so we need to create
             // "artificial" exceptions if the delta between two of them is greater
             // than the maximum unsigned byte value.
             out.writeByte((byte) 0xFF);
             previousExceptionIndex += 0xFF;
             out.writeByte(in[previousExceptionIndex]);
             numExceptions2++;
           }
           out.writeByte((byte) (i - previousExceptionIndex));
           previousExceptionIndex = i;
           out.writeByte((byte) b);
           numExceptions2++;
         }
       }
       if (numExceptions != numExceptions2) {
         throw new IllegalStateException("" + numExceptions + " <> " + numExceptions2 + " " + new BytesRef(in, 0, len).utf8ToString());
       }
     }

     return true;
   }

   /**
    * Decompress data that has been compressed with {@link #compress(byte[], int, byte[], DataOutput)}.
    * {@code len} must be the original length, not the compressed length.
    */
   public static void decompress(DataInput in, byte[] out, int len) throws IOException {
     final int saved = len >>> 2;
     int compressedLen = len - saved;

     // 1. Copy the packed bytes
     in.readBytes(out, 0, compressedLen);

     // 2. Restore the leading 2 bits of each packed byte into whole bytes
     for (int i = 0; i < saved; ++i) {
       out[compressedLen + i] = (byte) (((out[i] & 0xC0) >>> 2) | ((out[saved + i] & 0xC0) >>> 4) | ((out[(saved<<1) + i] & 0xC0) >>> 6));
     }

     // 3. Move back to the original range. This loop gets auto-vectorized on JDK13+.
     for (int i = 0; i < len; ++i) {
       final byte b = out[i];
       out[i] = (byte) (((b & 0x1F) | 0x20 | ((b & 0x20) << 1)) - 1);
     }

     // 4. Restore exceptions
     final int numExceptions = in.readVInt();
     int i = 0;
     for (int exception = 0; exception < numExceptions; ++exception) {
       i += in.readByte() & 0xFF;
       out[i] = in.readByte();
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.util.compress;

	import java.io.IOException;

	import org.apache.lucene.store.DataInput;
	import org.apache.lucene.store.DataOutput;
	import org.apache.lucene.util.BytesRef;

	/**
	* Utility class that can efficiently compress arrays that mostly contain
	* characters in the [0x1F,0x3F) or [0x5F,0x7F) ranges, which notably
	* include all digits, lowercase characters, '.', '-' and '_'.
	*/
	public final class LowercaseAsciiCompression {

	private static final boolean isCompressible(int b) {
	final int high3Bits = (b + 1) & ~0x1F;
	return high3Bits == 0x20 \|\| high3Bits == 0x60;
	}

	private LowercaseAsciiCompression() {}

	/**
	* Compress {@code in[0:len]} into {@code out}.
	* This returns {@code false} if the content cannot be compressed. The number
	* of bytes written is guaranteed to be less than {@code len} otherwise.
	*/
	public static boolean compress(byte[] in, int len, byte[] tmp, DataOutput out) throws IOException {
	if (len < 8) {
	return false;
	}

	// 1. Count exceptions and fail compression if there are too many of them.
	final int maxExceptions = len >>> 5;
	int previousExceptionIndex = 0;
	int numExceptions = 0;
	for (int i = 0; i < len; ++i) {
	final int b = in[i] & 0xFF;
	if (isCompressible(b) == false) {
	while (i - previousExceptionIndex > 0xFF) {
	++numExceptions;
	previousExceptionIndex += 0xFF;
	}
	if (++numExceptions > maxExceptions) {
	return false;
	}
	previousExceptionIndex = i;
	}
	}
	assert numExceptions <= maxExceptions;

	// 2. Now move all bytes to the [0,0x40) range (6 bits). This loop gets auto-vectorized on JDK13+.
	final int compressedLen = len - (len >>> 2); // ignores exceptions
	assert compressedLen < len;
	for (int i = 0; i < len; ++i) {
	int b = (in[i] & 0xFF) + 1;
	tmp[i] = (byte) ((b & 0x1F) \| ((b & 0x40) >>> 1));
	}

	// 3. Now pack the bytes so that we record 4 ASCII chars in 3 bytes
	int o = 0;
	for (int i = compressedLen; i < len; ++i) {
	tmp[o++] \|= (tmp[i] & 0x30) << 2; // bits 4-5
	}
	for (int i = compressedLen; i < len; ++i) {
	tmp[o++] \|= (tmp[i] & 0x0C) << 4; // bits 2-3
	}
	for (int i = compressedLen; i < len; ++i) {
	tmp[o++] \|= (tmp[i] & 0x03) << 6; // bits 0-1
	}
	assert o <= compressedLen;

	out.writeBytes(tmp, 0, compressedLen);

	// 4. Finally record exceptions
	out.writeVInt(numExceptions);
	if (numExceptions > 0) {
	previousExceptionIndex = 0;
	int numExceptions2 = 0;
	for (int i = 0; i < len; ++i) {
	int b = in[i] & 0xFF;
	if (isCompressible(b) == false) {
	while (i - previousExceptionIndex > 0xFF) {
	// We record deltas between exceptions as bytes, so we need to create
	// "artificial" exceptions if the delta between two of them is greater
	// than the maximum unsigned byte value.
	out.writeByte((byte) 0xFF);
	previousExceptionIndex += 0xFF;
	out.writeByte(in[previousExceptionIndex]);
	numExceptions2++;
	}
	out.writeByte((byte) (i - previousExceptionIndex));
	previousExceptionIndex = i;
	out.writeByte((byte) b);
	numExceptions2++;
	}
	}
	if (numExceptions != numExceptions2) {
	throw new IllegalStateException("" + numExceptions + " <> " + numExceptions2 + " " + new BytesRef(in, 0, len).utf8ToString());
	}
	}

	return true;
	}

	/**
	* Decompress data that has been compressed with {@link #compress(byte[], int, byte[], DataOutput)}.
	* {@code len} must be the original length, not the compressed length.
	*/
	public static void decompress(DataInput in, byte[] out, int len) throws IOException {
	final int saved = len >>> 2;
	int compressedLen = len - saved;

	// 1. Copy the packed bytes
	in.readBytes(out, 0, compressedLen);

	// 2. Restore the leading 2 bits of each packed byte into whole bytes
	for (int i = 0; i < saved; ++i) {
	out[compressedLen + i] = (byte) (((out[i] & 0xC0) >>> 2) \| ((out[saved + i] & 0xC0) >>> 4) \| ((out[(saved<<1) + i] & 0xC0) >>> 6));
	}

	// 3. Move back to the original range. This loop gets auto-vectorized on JDK13+.
	for (int i = 0; i < len; ++i) {
	final byte b = out[i];
	out[i] = (byte) (((b & 0x1F) \| 0x20 \| ((b & 0x20) << 1)) - 1);
	}

	// 4. Restore exceptions
	final int numExceptions = in.readVInt();
	int i = 0;
	for (int exception = 0; exception < numExceptions; ++exception) {
	i += in.readByte() & 0xFF;
	out[i] = in.readByte();
	}
	}
	}