blob: 16b66798c234f9c083db91f6fe9558cd05c31b90 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.utils.bytecomparable;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import com.google.common.base.Preconditions;
import org.apache.cassandra.db.marshal.ValueAccessor;
/**
* Contains inverse transformation utilities for {@link ByteSource}s.
*
* See ByteComparable.md for details about the encoding scheme.
*/
public final class ByteSourceInverse
{
private static final int INITIAL_BUFFER_CAPACITY = 32;
private static final int BYTE_ALL_BITS = 0xFF;
private static final int BYTE_NO_BITS = 0x00;
private static final int BYTE_SIGN_BIT = 1 << 7;
private static final int SHORT_SIGN_BIT = 1 << 15;
private static final int INT_SIGN_BIT = 1 << 31;
private static final long LONG_SIGN_BIT = 1L << 63;
/**
* Consume the given number of bytes and produce a long from them, effectively treating the bytes as a big-endian
* unsigned encoding of the number.
*/
public static long getUnsignedFixedLengthAsLong(ByteSource byteSource, int length)
{
Preconditions.checkNotNull(byteSource);
Preconditions.checkArgument(length >= 1 && length <= 8, "Between 1 and 8 bytes can be read at a time");
long result = 0;
for (int i = 0; i < length; ++i)
result = (result << 8) | getAndCheckByte(byteSource, i, length); // note: this must use the unsigned byte value
return result;
}
/**
* Produce the bytes for an encoded signed fixed-length number.
* The first byte has its sign bit inverted, and the rest are passed unchanged.
*/
public static <V> V getSignedFixedLength(ValueAccessor<V> accessor, ByteSource byteSource, int length)
{
Preconditions.checkNotNull(byteSource);
Preconditions.checkArgument(length >= 1, "At least 1 byte should be read");
V result = accessor.allocate(length);
// The first byte needs to have its sign flipped
accessor.putByte(result, 0, (byte) (getAndCheckByte(byteSource, 0, length) ^ BYTE_SIGN_BIT));
// and the rest can be retrieved unchanged.
for (int i = 1; i < length; ++i)
accessor.putByte(result, i, (byte) getAndCheckByte(byteSource, i, length));
return result;
}
/**
* Produce the bytes for an encoded signed fixed-length number, also translating null to empty buffer.
* The first byte has its sign bit inverted, and the rest are passed unchanged.
*/
public static <V> V getOptionalSignedFixedLength(ValueAccessor<V> accessor, ByteSource byteSource, int length)
{
return byteSource == null ? accessor.empty() : getSignedFixedLength(accessor, byteSource, length);
}
/**
* Produce the bytes for an encoded signed fixed-length floating-point number.
* If sign bit is on, returns negated bytes. If not, clears the sign bit and passes the rest of the bytes unchanged.
*/
public static <V> V getSignedFixedLengthFloat(ValueAccessor<V> accessor, ByteSource byteSource, int length)
{
Preconditions.checkNotNull(byteSource);
Preconditions.checkArgument(length >= 1, "At least 1 byte should be read");
V result = accessor.allocate(length);
int xor;
int first = getAndCheckByte(byteSource, 0, length);
if (first < 0x80)
{
// Negative number. Invert all bits.
xor = BYTE_ALL_BITS;
first ^= xor;
}
else
{
// Positive number. Invert only the sign bit.
xor = BYTE_NO_BITS;
first ^= BYTE_SIGN_BIT;
}
accessor.putByte(result, 0, (byte) first);
// xor is now applied to the rest of the bytes to flip their bits if necessary.
for (int i = 1; i < length; ++i)
accessor.putByte(result, i, (byte) (getAndCheckByte(byteSource, i, length) ^ xor));
return result;
}
/**
* Produce the bytes for an encoded signed fixed-length floating-point number, also translating null to an empty
* buffer.
* If sign bit is on, returns negated bytes. If not, clears the sign bit and passes the rest of the bytes unchanged.
*/
public static <V> V getOptionalSignedFixedLengthFloat(ValueAccessor<V> accessor, ByteSource byteSource, int length)
{
return byteSource == null ? accessor.empty() : getSignedFixedLengthFloat(accessor, byteSource, length);
}
/**
* Consume the next length bytes from the source unchanged.
*/
public static <V> V getFixedLength(ValueAccessor<V> accessor, ByteSource byteSource, int length)
{
Preconditions.checkNotNull(byteSource);
Preconditions.checkArgument(length >= 1, "At least 1 byte should be read");
V result = accessor.allocate(length);
for (int i = 0; i < length; ++i)
accessor.putByte(result, i, (byte) getAndCheckByte(byteSource, i, length));
return result;
}
/**
* Consume the next length bytes from the source unchanged, also translating null to an empty buffer.
*/
public static <V> V getOptionalFixedLength(ValueAccessor<V> accessor, ByteSource byteSource, int length)
{
return byteSource == null ? accessor.empty() : getFixedLength(accessor, byteSource, length);
}
/**
* Consume the next {@code int} from the current position of the given {@link ByteSource}. The source position is
* modified accordingly (moved 4 bytes forward).
* <p>
* The source is not strictly required to represent just the encoding of an {@code int} value, so theoretically
* this API could be used for reading data in 4-byte strides. Nevertheless its usage is fairly limited because:
* <ol>
* <li>...it presupposes signed fixed-length encoding for the encoding of the original value</li>
* <li>...it decodes the data returned on each stride as an {@code int} (i.e. it inverts its leading bit)</li>
* <li>...it doesn't provide any meaningful guarantees (with regard to throwing) in case there are not enough
* bytes to read, in case a special escape value was not interpreted as such, etc.</li>
* </ol>
* </p>
*
* @param byteSource A non-null byte source, containing at least 4 bytes.
*/
public static int getSignedInt(ByteSource byteSource)
{
return (int) getUnsignedFixedLengthAsLong(byteSource, 4) ^ INT_SIGN_BIT;
}
/**
* Consume the next {@code long} from the current position of the given {@link ByteSource}. The source position is
* modified accordingly (moved 8 bytes forward).
* <p>
* The source is not strictly required to represent just the encoding of a {@code long} value, so theoretically
* this API could be used for reading data in 8-byte strides. Nevertheless its usage is fairly limited because:
* <ol>
* <li>...it presupposes signed fixed-length encoding for the encoding of the original value</li>
* <li>...it decodes the data returned on each stride as a {@code long} (i.e. it inverts its leading bit)</li>
* <li>...it doesn't provide any meaningful guarantees (with regard to throwing) in case there are not enough
* bytes to read, in case a special escape value was not interpreted as such, etc.</li>
* </ol>
* </p>
*
* @param byteSource A non-null byte source, containing at least 8 bytes.
*/
public static long getSignedLong(ByteSource byteSource)
{
return getUnsignedFixedLengthAsLong(byteSource, 8) ^ LONG_SIGN_BIT;
}
/**
* Converts the given {@link ByteSource} to a {@code byte}.
*
* @param byteSource A non-null byte source, containing at least 1 byte.
*/
public static byte getSignedByte(ByteSource byteSource)
{
return (byte) (getAndCheckByte(Preconditions.checkNotNull(byteSource), 0, 1) ^ BYTE_SIGN_BIT);
}
/**
* Converts the given {@link ByteSource} to a {@code short}. All terms and conditions valid for
* {@link #getSignedInt(ByteSource)} and {@link #getSignedLong(ByteSource)} translate to this as well.
*
* @param byteSource A non-null byte source, containing at least 2 bytes.
*
* @see #getSignedInt(ByteSource)
* @see #getSignedLong(ByteSource)
*/
public static short getSignedShort(ByteSource byteSource)
{
return (short) (getUnsignedFixedLengthAsLong(byteSource, 2) ^ SHORT_SIGN_BIT);
}
/**
* Decode a variable-length signed integer.
*/
public static long getVariableLengthInteger(ByteSource byteSource)
{
int signAndMask = getAndCheckByte(byteSource);
long sum = 0;
int bytes;
// For every bit after the sign that matches the sign, read one more byte.
for (bytes = 0; bytes < 7 && sameByteSign(signAndMask << (bytes + 1), signAndMask); ++bytes)
sum = (sum << 8) | getAndCheckByte(byteSource);
// The eighth length bit is stored in the second byte.
if (bytes == 7 && sameByteSign((int) (sum >> 48), signAndMask))
return ((sum << 8) | getAndCheckByte(byteSource)) ^ LONG_SIGN_BIT; // 9-byte encoding, use bytes 2-9 with inverted sign
else
{
sum |= (((long) signAndMask) << bytes * 8); // add the rest of the bits
long signMask = -0x40L << bytes * 7; // mask of the bits that should be replaced by the sign
long sign = (byte) (signAndMask ^ 0x80) >> 7; // -1 if negative (0 leading bit), 0 otherwise
return sum & ~signMask | sign & signMask;
}
}
/**
* Decode a variable-length unsigned integer, passing all bytes read through XOR with the given xorWith parameter.
*
* Used in BigInteger encoding to read number length, where negative numbers have their length negated
* (i.e. xorWith = 0xFF) to ensure correct ordering.
*/
public static long getVariableLengthUnsignedIntegerXoring(ByteSource byteSource, int xorWith)
{
int signAndMask = getAndCheckByte(byteSource) ^ xorWith;
long sum = 0;
int bytes;
// Read an extra byte while the next most significant bit is 1.
for (bytes = 0; bytes <= 7 && ((signAndMask << bytes) & 0x80) != 0; ++bytes)
sum = (sum << 8) | getAndCheckByte(byteSource) ^ xorWith;
// Strip the length bits from the leading byte.
signAndMask &= ~(-256 >> bytes);
return sum | (((long) signAndMask) << bytes * 8); // Add the rest of the bits of the leading byte.
}
/** Returns true if the two parameters treated as bytes have the same sign. */
private static boolean sameByteSign(int a, int b)
{
return ((a ^ b) & 0x80) == 0;
}
private static int getAndCheckByte(ByteSource byteSource)
{
return getAndCheckByte(byteSource, -1, -1);
}
private static int getAndCheckByte(ByteSource byteSource, int pos, int length)
{
int data = byteSource.next();
if (data == ByteSource.END_OF_STREAM)
throw new IllegalArgumentException(
length > 0 ? String.format("Unexpected end of stream reached after %d bytes (expected >= %d)", pos, length)
: "Unexpected end of stream");
assert data >= BYTE_NO_BITS && data <= BYTE_ALL_BITS
: "A ByteSource must produce unsigned bytes and end in END_OF_STREAM";
return data;
}
/**
* Reads a single variable-length byte sequence (blob, string, ...) encoded according to the scheme described
* in ByteComparable.md, decoding it back to its original, unescaped form.
*
* @param byteSource The source of the variable-length bytes sequence.
* @return A byte array containing the original, unescaped bytes of the given source. Unescaped here means
* not including any of the escape sequences of the encoding scheme used for variable-length byte sequences.
*/
public static byte[] getUnescapedBytes(ByteSource.Peekable byteSource)
{
return byteSource == null ? null : readBytes(unescape(byteSource));
}
/**
* As above, but converts the result to a ByteSource.
*/
public static ByteSource unescape(ByteSource.Peekable byteSource)
{
return new ByteSource() {
boolean escaped = false;
@Override
public int next()
{
if (!escaped)
{
int data = byteSource.next(); // we consume this byte no matter what it is
if (data > ByteSource.ESCAPE)
return data; // most used path leads here
assert data != ByteSource.END_OF_STREAM : "Invalid escaped byte sequence";
escaped = true;
}
int next = byteSource.peek();
switch (next)
{
case END_OF_STREAM:
// The end of a byte-comparable outside of a multi-component sequence. No matter what we have
// seen or peeked before, we should stop now.
byteSource.next();
return END_OF_STREAM;
case ESCAPED_0_DONE:
// The end of 1 or more consecutive 0x00 value bytes.
escaped = false;
byteSource.next();
return ESCAPE;
case ESCAPED_0_CONT:
// Escaped sequence continues
byteSource.next();
return ESCAPE;
default:
// An ESCAPE or ESCAPED_0_CONT won't be followed by either another ESCAPED_0_CONT, an
// ESCAPED_0_DONE, or an END_OF_STREAM only when the byte-comparable is part of a multi-component
// sequence and we have reached the end of the encoded byte-comparable. In this case, the byte
// we have just peeked is the separator or terminator byte between or at the end of components
// (which by contact must be 0x10 - 0xFE, which cannot conflict with our special bytes).
assert next >= ByteSource.MIN_SEPARATOR && next <= ByteSource.MAX_SEPARATOR : next;
// Unlike above, we don't consume this byte (the sequence decoding needs it).
return END_OF_STREAM;
}
}
};
}
/**
* Reads the bytes of the given source into a byte array. Doesn't do any transformation on the bytes, just reads
* them until it reads an {@link ByteSource#END_OF_STREAM} byte, after which it returns an array of all the read
* bytes, <strong>excluding the {@link ByteSource#END_OF_STREAM}</strong>.
* <p>
* This method sizes a tentative internal buffer array at {@code initialBufferCapacity}. However, if
* {@code byteSource} exceeds this size, the buffer array is recreated with doubled capacity as many times as
* necessary. If, after {@code byteSource} is fully exhausted, the number of bytes read from it does not exactly
* match the current size of the tentative buffer array, then it is copied into another array sized to fit the
* number of bytes read; otherwise, it is returned without that final copy step.
*
* @param byteSource The source which bytes we're interested in.
* @param initialBufferCapacity The initial size of the internal buffer.
* @return A byte array containing exactly all the read bytes. In case of a {@code null} source, the returned byte
* array will be empty.
*/
public static byte[] readBytes(ByteSource byteSource, final int initialBufferCapacity)
{
Preconditions.checkNotNull(byteSource);
int readBytes = 0;
byte[] buf = new byte[initialBufferCapacity];
int data;
while ((data = byteSource.next()) != ByteSource.END_OF_STREAM)
{
buf = ensureCapacity(buf, readBytes);
buf[readBytes++] = (byte) data;
}
if (readBytes != buf.length)
{
buf = Arrays.copyOf(buf, readBytes);
}
return buf;
}
/**
* Reads the bytes of the given source into a byte array. Doesn't do any transformation on the bytes, just reads
* them until it reads an {@link ByteSource#END_OF_STREAM} byte, after which it returns an array of all the read
* bytes, <strong>excluding the {@link ByteSource#END_OF_STREAM}</strong>.
* <p>
* This is equivalent to {@link #readBytes(ByteSource, int)} where the second actual parameter is
* {@linkplain #INITIAL_BUFFER_CAPACITY} ({@value INITIAL_BUFFER_CAPACITY}).
*
* @param byteSource The source which bytes we're interested in.
* @return A byte array containing exactly all the read bytes. In case of a {@code null} source, the returned byte
* array will be empty.
*/
public static byte[] readBytes(ByteSource byteSource)
{
return readBytes(byteSource, INITIAL_BUFFER_CAPACITY);
}
/**
* Ensures the given buffer has capacity for taking data with the given length - if it doesn't, it returns a copy
* of the buffer, but with double the capacity.
*/
private static byte[] ensureCapacity(byte[] buf, int dataLengthInBytes)
{
if (dataLengthInBytes == buf.length)
// We won't gain much with guarding against overflow. We'll overflow when dataLengthInBytes >= 1 << 30,
// and if we do guard, we'll be able to extend the capacity to Integer.MAX_VALUE (which is 1 << 31 - 1).
// Controlling the exception that will be thrown shouldn't matter that much, and in practice, we almost
// surely won't be reading gigabytes of ByteSource data at once.
return Arrays.copyOf(buf, dataLengthInBytes * 2);
else
return buf;
}
/**
* Converts the given {@link ByteSource} to a UTF-8 {@link String}.
*
* @param byteSource The source we're interested in.
* @return A UTF-8 string corresponding to the given source.
*/
public static String getString(ByteSource.Peekable byteSource)
{
if (byteSource == null)
return null;
byte[] data = getUnescapedBytes(byteSource);
return new String(data, StandardCharsets.UTF_8);
}
/*
* Multi-component sequence utilities.
*/
/**
* A utility for consuming components from a peekable multi-component sequence.
* It uses the component separators, so the given sequence needs to have its last component fully consumed, in
* order for the next consumable byte to be a separator. Identifying the end of the component that will then be
* consumed is the responsibility of the consumer (the user of this method).
* @param source A peekable multi-component sequence, which next byte is a component separator.
* @return the given multi-component sequence if its next component is not null, or {@code null} if it is.
*/
public static ByteSource.Peekable nextComponentSource(ByteSource.Peekable source)
{
return nextComponentSource(source, source.next());
}
/**
* A utility for consuming components from a peekable multi-component sequence, very similar to
* {@link #nextComponentSource(ByteSource.Peekable)} - the difference being that here the separator can be passed
* in case it had to be consumed beforehand.
*/
public static ByteSource.Peekable nextComponentSource(ByteSource.Peekable source, int separator)
{
return nextComponentNull(separator)
? null
: source;
}
public static boolean nextComponentNull(int separator)
{
return separator == ByteSource.NEXT_COMPONENT_NULL || separator == ByteSource.NEXT_COMPONENT_EMPTY
|| separator == ByteSource.NEXT_COMPONENT_EMPTY_REVERSED;
}
}