blob: 207a972b4cafc9022413d8fd2ec1ea31679d16a2 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.db;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.security.MessageDigest;
import net.nicoulaj.compilecommand.annotations.Inline;
import org.apache.cassandra.config.CFMetaData;
import org.apache.cassandra.config.ColumnDefinition;
import org.apache.cassandra.cql3.ColumnIdentifier;
import org.apache.cassandra.db.composites.*;
import org.apache.cassandra.db.filter.ColumnSlice;
import org.apache.cassandra.db.marshal.AbstractType;
import org.apache.cassandra.db.marshal.CompositeType;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.FastByteOperations;
import org.apache.cassandra.utils.concurrent.OpOrder;
import org.apache.cassandra.utils.memory.*;
/**
* <pre>
* {@code
* Packs a CellName AND a Cell into one off-heap representation.
* Layout is:
*
* Note we store the ColumnIdentifier in full as bytes. This seems an okay tradeoff for now, as we just
* look it back up again when we need to, and in the near future we hope to switch to ints, longs or
* UUIDs representing column identifiers on disk, at which point we can switch that here as well.
*
* [size][timestamp][value offset][name size][name extra][name offset deltas][cell names][value][Descendants]
* [ 4b ][ 8b ][ 4b ][ 2b ][ 1b ][ each 2b ][ arb < 64k][ arb ][ arbitrary ]
*
* descendants: any overriding classes will put their state here
* name offsets are deltas from their base offset, and don't include the first offset, or the end position of the final entry,
* i.e. there will be size - 1 entries, and each is a delta that is added to the offset of the position of the first name
* (which is always CELL_NAME_OFFSETS_OFFSET + (2 * (size - 1))). The length of the final name fills up any remaining
* space up to the value offset
* name extra: lowest 2 bits indicate the clustering size delta (i.e. how many name items are NOT part of the clustering key)
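* the next 3 bits hold the index of the NameType (COMPOUND_DENSE, COMPOUND_SPARSE, COMPOUND_SPARSE_STATIC, SIMPLE_DENSE or SIMPLE_SPARSE);
* a static column is encoded as the distinct COMPOUND_SPARSE_STATIC type rather than by a separate flag bit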
* }
* </pre>
*/
public abstract class AbstractNativeCell extends AbstractCell implements CellName
{
// Byte offsets of the fixed-width header fields, relative to the start of the allocation.
// Offset 0 holds an int with the total allocation size (see internalSize()), so the timestamp starts at 4.
static final int TIMESTAMP_OFFSET = 4;
private static final int VALUE_OFFSET_OFFSET = TIMESTAMP_OFFSET + 8;          // after the 8-byte timestamp
private static final int CELL_NAME_SIZE_OFFSET = VALUE_OFFSET_OFFSET + 4;     // after the 4-byte value offset
private static final int CELL_NAME_EXTRA_OFFSET = CELL_NAME_SIZE_OFFSET + 2;  // after the 2-byte name size
private static final int CELL_NAME_OFFSETS_OFFSET = CELL_NAME_EXTRA_OFFSET + 1; // after the 1-byte name extra

// Bit layout of the "name extra" byte: the low two bits are the clustering size delta,
// and the NameType index sits in the three bits above them.
private static final int CELL_NAME_SIZE_DELTA_MASK = 0x3;
private static final int CELL_NAME_TYPE_SHIFT = 2;
private static final int CELL_NAME_TYPE_MASK = 0x7;
private static enum NameType
{
COMPOUND_DENSE(0 << 2), COMPOUND_SPARSE(1 << 2), COMPOUND_SPARSE_STATIC(2 << 2), SIMPLE_DENSE(3 << 2), SIMPLE_SPARSE(4 << 2);
static final NameType[] TYPES = NameType.values();
final int bits;
NameType(int bits)
{
this.bits = bits;
}
static NameType typeOf(CellName name)
{
if (name instanceof CompoundDenseCellName)
{
assert !name.isStatic();
return COMPOUND_DENSE;
}
if (name instanceof CompoundSparseCellName)
return name.isStatic() ? COMPOUND_SPARSE_STATIC : COMPOUND_SPARSE;
if (name instanceof SimpleDenseCellName)
{
assert !name.isStatic();
return SIMPLE_DENSE;
}
if (name instanceof SimpleSparseCellName)
{
assert !name.isStatic();
return SIMPLE_SPARSE;
}
if (name instanceof NativeCell)
return ((NativeCell) name).nametype();
throw new AssertionError();
}
}
// Base address of this cell's allocation; every accessor below addresses memory as peer + offset.
// It is final and assigned by the constructors only (-1 when no memory was allocated).
private final long peer;
/**
 * Creates an instance without any backing allocation: {@code peer} is set to -1, a value the
 * {@code peer > 0} assertion in checkPosition rejects when assertions are enabled.
 */
AbstractNativeCell()
{
peer = -1;
}
public AbstractNativeCell(NativeAllocator allocator, OpOrder.Group writeOp, Cell copyOf)
{
int size = sizeOf(copyOf);
peer = allocator.allocate(size, writeOp);
MemoryUtil.setInt(peer, size);
construct(copyOf);
}
/**
 * Computes how many bytes are needed to store {@code cell}: the fixed header, one 2-byte delta for
 * every name component after the first, the bytes of every name component, and the value bytes.
 *
 * @param cell the cell to be stored
 * @return the total number of bytes required
 */
protected int sizeOf(Cell cell)
{
    final CellName name = cell.name();
    final int components = name.size();

    int nameBytes = 0;
    for (int i = 0; i < components; i++)
        nameBytes += name.get(i).remaining();

    // there is no delta entry for the first component, and none at all for an empty name
    final int deltaBytes = 2 * Math.max(0, components - 1);
    return CELL_NAME_OFFSETS_OFFSET + deltaBytes + cell.value().remaining() + nameBytes;
}
/**
 * Serialises {@code from} into this cell's memory: timestamp, name component count, the "name extra"
 * byte, the cumulative name deltas, the name components themselves, the value offset and finally the
 * value bytes.
 *
 * @param from the cell to copy
 */
protected void construct(Cell from)
{
    setLong(TIMESTAMP_OFFSET, from.timestamp());

    final CellName name = from.name();
    final int components = name.size();
    setShort(CELL_NAME_SIZE_OFFSET, (short) components);

    // only two bits are available to record how many components lie beyond the clustering prefix
    final int nonClustering = components - name.clusteringSize();
    assert nonClustering <= 2;
    final int extra = nonClustering | NameType.typeOf(name).bits;
    setByte(CELL_NAME_EXTRA_OFFSET, (byte) extra);

    // one cumulative delta for every component except the first, whose delta is implicitly zero
    int cursor = CELL_NAME_OFFSETS_OFFSET;
    int cumulativeLength = 0;
    for (int i = 0; i < components - 1; i++)
    {
        cumulativeLength += name.get(i).remaining();
        setShort(cursor, (short) cumulativeLength);
        cursor += 2;
    }

    // the component bytes follow back to back
    for (int i = 0; i < components; i++)
    {
        final ByteBuffer component = name.get(i);
        setBytes(cursor, component);
        cursor += component.remaining();
    }

    // whatever follows the name is the value; record where it begins, then write it
    setInt(VALUE_OFFSET_OFFSET, cursor);
    setBytes(cursor, from.value());
}
// Offset of the 2-byte delta entry for name component i; entries exist for 1 <= i < size, as the
// first component has no entry. Callers also pass i == size to obtain the offset just past the
// last entry, which is where the name bytes begin.
private int nameDeltaOffset(int i)
{
return CELL_NAME_OFFSETS_OFFSET + ((i - 1) * 2);
}
// Reads the stored int giving the offset (from the start of the allocation) at which the value begins.
int valueStartOffset()
{
return getInt(VALUE_OFFSET_OFFSET);
}
// Offset just past the value: the allocation size minus whatever postfixSize() reports.
private int valueEndOffset()
{
return (int) (internalSize() - postfixSize());
}
// Number of bytes at the end of the allocation that are not part of the value. This base
// implementation reports none; the method is protected so that it can be overridden.
protected int postfixSize()
{
return 0;
}
/**
 * Returns the value as a big-endian buffer obtained from getByteBuffer over this cell's memory,
 * spanning from the stored value offset up to the end of the allocation less {@link #postfixSize()}.
 */
@Override
public ByteBuffer value()
{
    final long start = valueStartOffset();
    final long end = internalSize() - postfixSize();
    final int length = (int) (end - start);
    return getByteBuffer(start, length).order(ByteOrder.BIG_ENDIAN);
}
// Number of name components that are not part of the clustering prefix, taken from the low two
// bits of the "name extra" byte.
private int clusteringSizeDelta()
{
return getByte(CELL_NAME_EXTRA_OFFSET) & CELL_NAME_SIZE_DELTA_MASK;
}
// Static-ness is not a separate flag: a cell is static exactly when its stored name type is
// COMPOUND_SPARSE_STATIC.
public boolean isStatic()
{
return nametype() == NameType.COMPOUND_SPARSE_STATIC;
}
/**
 * Decodes the name type stored in the "name extra" byte.
 *
 * @return the NameType whose index is held in the bits above the clustering size delta
 */
NameType nametype()
{
    final int extra = getByte(CELL_NAME_EXTRA_OFFSET);
    final int index = (extra >> CELL_NAME_TYPE_SHIFT) & CELL_NAME_TYPE_MASK;
    return NameType.TYPES[index];
}
// A single cell has one timestamp, so the minimum is simply timestamp().
public long minTimestamp()
{
return timestamp();
}
// A single cell has one timestamp, so the maximum is simply timestamp().
public long maxTimestamp()
{
return timestamp();
}
// Number of leading name components forming the clustering prefix: the total component count
// minus the stored clustering size delta.
public int clusteringSize()
{
return size() - clusteringSizeDelta();
}
/**
 * Resolves the CQL3 column identifier of this name.
 *
 * @param metadata the table metadata used to look the identifier up
 * @return {@code null} for dense names; for sparse names, the identifier of the component that
 *         follows the clustering prefix, or the row marker id when a compound sparse name has an
 *         empty component there
 */
@Override
public ColumnIdentifier cql3ColumnName(CFMetaData metadata)
{
    final NameType type = nametype();
    switch (type)
    {
        case SIMPLE_DENSE:
        case COMPOUND_DENSE:
            return null;
        case SIMPLE_SPARSE:
            return getIdentifier(metadata, get(clusteringSize()));
        case COMPOUND_SPARSE:
        case COMPOUND_SPARSE_STATIC:
        {
            final ByteBuffer columnName = get(clusteringSize());
            if (columnName.hasRemaining())
                return getIdentifier(metadata, columnName);
            // an empty column name component denotes the row marker
            return CompoundSparseCellNameType.rowMarkerId;
        }
        default:
            throw new AssertionError();
    }
}
/**
 * @return the final name component when this is a collection cell, otherwise {@code null}
 */
public ByteBuffer collectionElement()
{
    if (!isCollectionCell())
        return null;
    return get(size() - 1);
}
// we always have a collection element if our clustering size is 2 less than our total size,
// and we never have one otherwise
public boolean isCollectionCell()
{
return clusteringSizeDelta() == 2;
}
/**
 * Tells whether {@code other} belongs to the same CQL3 row as this name.
 *
 * @param type  the comparator used to compare names and their components
 * @param other the name to compare against
 * @return for dense names, whether the two names compare equal; for compound sparse names, whether
 *         both have the same clustering size, the same static-ness and equal clustering components;
 *         always {@code true} for simple sparse names
 */
public boolean isSameCQL3RowAs(CellNameType type, CellName other)
{
    switch (nametype())
    {
        case SIMPLE_SPARSE:
            return true;
        case SIMPLE_DENSE:
        case COMPOUND_DENSE:
            return type.compare(this, other) == 0;
        case COMPOUND_SPARSE:
        case COMPOUND_SPARSE_STATIC:
        {
            final int prefixLength = clusteringSize();
            if (prefixLength != other.clusteringSize())
                return false;
            if (isStatic() != other.isStatic())
                return false;
            // only the clustering prefix identifies the row; the column name that follows it does not
            for (int i = 0; i < prefixLength; i++)
            {
                final boolean sameComponent = type.subtype(i).compare(get(i), other.get(i)) == 0;
                if (!sameComponent)
                    return false;
            }
            return true;
        }
        default:
            throw new AssertionError();
    }
}
// Number of name components, read from the 2-byte field at CELL_NAME_SIZE_OFFSET.
public int size()
{
return getShort(CELL_NAME_SIZE_OFFSET);
}
// A name is empty when it has no components.
public boolean isEmpty()
{
return size() == 0;
}
// Returns name component i without copying it through an allocator (delegates with a null allocator).
public ByteBuffer get(int i)
{
return get(i, null);
}
/**
 * Returns name component {@code i}.
 *
 * @param i    index of the component; must satisfy {@code 0 <= i < size()}
 * @param copy when {@code null}, the result is a big-endian buffer obtained from getByteBuffer;
 *             otherwise a buffer is taken from this allocator and the component bytes are copied
 *             into it
 * @return the component's bytes
 */
private ByteBuffer get(int i, AbstractAllocator copy)
{
    final int components = size();
    assert i >= 0 && i < components;

    // the name bytes begin immediately after the last delta entry
    final int namesStart = nameDeltaOffset(components);
    final int begin = (i == 0) ? 0 : getShort(nameDeltaOffset(i));
    final int end;
    if (i < components - 1)
        end = getShort(nameDeltaOffset(i + 1));
    else
        end = valueStartOffset() - namesStart; // the last component runs up to the value
    final int length = end - begin;

    if (copy != null)
    {
        final ByteBuffer target = copy.allocate(length);
        // NOTE(review): the copy is written at position 0 of the target; presumably allocate()
        // hands back a buffer positioned at 0 - confirm against the allocators in use
        FastByteOperations.UnsafeOperations.copy(null, peer + namesStart + begin, target, 0, length);
        return target;
    }
    return getByteBuffer(namesStart + begin, length).order(ByteOrder.BIG_ENDIAN);
}
// Per-thread 256-byte scratch array; writeMemoryTo copies this cell's bytes through it in batches
// before handing them to a MessageDigest.
private static final ThreadLocal<byte[]> BUFFER = new ThreadLocal<byte[]>()
{
protected byte[] initialValue()
{
return new byte[256];
}
};
/**
 * Feeds name component {@code i} into {@code digest}.
 *
 * @param digest      the digest to update
 * @param i           index of the component; must satisfy {@code 0 <= i < size()}
 * @param includeSize when {@code true}, the component's length is fed in (as a short) before its bytes
 */
protected void writeComponentTo(MessageDigest digest, int i, boolean includeSize)
{
    final int components = size();
    assert i >= 0 && i < components;

    // the name bytes begin immediately after the last delta entry
    final int namesStart = nameDeltaOffset(components);
    final int begin = (i == 0) ? 0 : getShort(nameDeltaOffset(i));
    final boolean isLast = i >= components - 1;
    final int end = isLast ? valueStartOffset() - namesStart
                           : getShort(nameDeltaOffset(i + 1));
    final int length = end - begin;

    if (includeSize)
        FBUtilities.updateWithShort(digest, length);
    writeMemoryTo(digest, namesStart + begin, length);
}
/**
 * Feeds {@code count} bytes of this cell's memory, starting at offset {@code from}, into {@code digest}.
 * While more than 16 bytes remain they are copied through the thread-local scratch array in chunks of
 * at most 256 bytes; whatever is left afterwards is fed in one byte at a time.
 *
 * @param digest the digest to update
 * @param from   offset of the first byte, relative to the start of the allocation
 * @param count  number of bytes to feed in
 */
protected void writeMemoryTo(MessageDigest digest, int from, int count)
{
    int done = 0;
    final int batchLimit = count - 16;
    if (done < batchLimit)
    {
        final byte[] scratch = BUFFER.get();
        do
        {
            // a chunk may run past batchLimit, but never past count
            final int chunk = Math.min(count - done, 256);
            getBytes(from + done, scratch, 0, chunk);
            digest.update(scratch, 0, chunk);
            done += chunk;
        }
        while (done < batchLimit);
    }
    for (; done < count; done++)
        digest.update(getByte(from + done));
}
// This implementation always reports EOC.NONE.
public EOC eoc()
{
return EOC.NONE;
}
// Not supported by this implementation: always throws UnsupportedOperationException.
public Composite withEOC(EOC eoc)
{
throw new UnsupportedOperationException();
}
// Not supported by this implementation: always throws UnsupportedOperationException.
public Composite start()
{
throw new UnsupportedOperationException();
}
// Not supported by this implementation: always throws UnsupportedOperationException.
public Composite end()
{
throw new UnsupportedOperationException();
}
// Not supported by this implementation: always throws UnsupportedOperationException.
public ColumnSlice slice()
{
throw new UnsupportedOperationException();
}
/**
 * Tells whether this name is a prefix of {@code c}.
 *
 * @param type the comparator supplying the per-component comparison
 * @param c    the composite to test against
 * @return {@code true} when this name has no more components than {@code c}, both agree on
 *         static-ness, and every component of this name compares equal to the matching one of {@code c}
 */
public boolean isPrefixOf(CType type, Composite c)
{
    final int components = size();
    if (components > c.size() || isStatic() != c.isStatic())
        return false;

    for (int i = 0; i < components; i++)
    {
        final boolean sameComponent = type.subtype(i).compare(get(i), c.get(i)) == 0;
        if (!sameComponent)
            return false;
    }
    return true;
}
/**
 * Serialises this name into a buffer.
 *
 * @return for simple names, the single name component as returned by {@code get(0)}; for compound
 *         names, a newly allocated heap buffer in the legacy composite format: an optional 2-byte
 *         static marker followed, for each component, by a 2-byte length, the component bytes and a
 *         zero end-of-component byte
 */
public ByteBuffer toByteBuffer()
{
    switch (nametype())
    {
        case SIMPLE_DENSE:
        case SIMPLE_SPARSE:
            // for simple names we just return our one name buffer
            return get(0);
        case COMPOUND_DENSE:
        case COMPOUND_SPARSE_STATIC:
        case COMPOUND_SPARSE:
        {
            // This is the legacy format of composites.
            // See org.apache.cassandra.db.marshal.CompositeType for details.
            final int components = size();
            final boolean isStatic = isStatic();

            // Size the buffer for exactly what is written below: the raw name bytes, three framing
            // bytes per component (2-byte length + end-of-component byte) and the optional 2-byte
            // static marker. Only the name is serialised here, so sizing by the whole cell (as
            // cellDataSize() does) allocated more than was ever used.
            final int nameBytes = valueStartOffset() - nameDeltaOffset(components);
            final int capacity = nameBytes + 3 * components + (isStatic ? 2 : 0);

            ByteBuffer result = ByteBuffer.allocate(capacity);
            if (isStatic)
                ByteBufferUtil.writeShortLength(result, CompositeType.STATIC_MARKER);
            for (int i = 0; i < components; i++)
            {
                ByteBuffer bb = get(i);
                ByteBufferUtil.writeShortLength(result, bb.remaining());
                result.put(bb);
                result.put((byte) 0);
            }
            result.flip();
            return result;
        }
        default:
            throw new AssertionError();
    }
}
/**
 * Feeds this name into {@code digest}: a simple name contributes the raw bytes of its single
 * component, while a compound name contributes the legacy composite framing (optional static marker,
 * then for each component its length, its bytes and a zero byte).
 *
 * @param digest the digest to update
 */
protected void updateWithName(MessageDigest digest)
{
    switch (nametype())
    {
        case SIMPLE_DENSE:
        case SIMPLE_SPARSE:
            writeComponentTo(digest, 0, false);
            return;
        case COMPOUND_DENSE:
        case COMPOUND_SPARSE:
        case COMPOUND_SPARSE_STATIC:
        {
            // This is the legacy format of composites.
            // See org.apache.cassandra.db.marshal.CompositeType for details.
            if (isStatic())
                FBUtilities.updateWithShort(digest, CompositeType.STATIC_MARKER);
            final int components = size();
            for (int i = 0; i < components; i++)
            {
                writeComponentTo(digest, i, true);
                digest.update((byte) 0);
            }
            return;
        }
        default:
            throw new AssertionError();
    }
}
/**
 * Feeds the value bytes, from the value start offset up to the value end offset, into {@code digest}.
 *
 * @param digest the digest to update
 */
protected void updateWithValue(MessageDigest digest)
{
    final int start = valueStartOffset();
    writeMemoryTo(digest, start, valueEndOffset() - start);
}
/**
 * Size of the serialised name (not of the whole cell).
 *
 * @return for simple names, the number of raw name bytes; for compound names, the raw name bytes plus
 *         three bytes per component and two more when the name is static
 */
@Override // this is the NAME dataSize, only!
public int dataSize()
{
    final NameType type = nametype();
    final int components = size();
    // everything between the end of the delta entries and the start of the value is name bytes
    final int rawNameBytes = valueStartOffset() - nameDeltaOffset(components);
    switch (type)
    {
        case SIMPLE_DENSE:
        case SIMPLE_SPARSE:
            return rawNameBytes;
        case COMPOUND_DENSE:
        case COMPOUND_SPARSE:
        case COMPOUND_SPARSE_STATIC:
        {
            final int framing = 3 * components;
            final int staticMarker = isStatic() ? 2 : 0;
            return rawNameBytes + framing + staticMarker;
        }
        default:
            throw new AssertionError();
    }
}
/**
 * Compares against an arbitrary object. A CellName is compared through {@link #equals(CellName)};
 * failing that, a Cell is compared through the Cell overload; anything else is unequal.
 */
public boolean equals(Object obj)
{
    if (this == obj)
        return true;
    if (obj instanceof CellName)
        return equals((CellName) obj);
    return obj instanceof Cell && equals((Cell) obj);
}
/**
 * Compares names component by component.
 *
 * @param that the name to compare against
 * @return {@code true} when both names have the same number of components and every pair of
 *         corresponding components is equal as ByteBuffers
 */
public boolean equals(CellName that)
{
    final int components = size();
    if (that.size() != components)
        return false;

    for (int i = 0; i < components; i++)
    {
        final ByteBuffer mine = get(i);
        if (!mine.equals(that.get(i)))
            return false;
    }
    return true;
}
// shared zero-length prefix used by copy() when there are no clustering components
private static final ByteBuffer[] EMPTY = new ByteBuffer[0];

/**
 * Builds a CellName of the matching kind from this name, copying the name components through
 * {@code allocator}.
 *
 * @param cfm       the table metadata used to resolve sparse column identifiers
 * @param allocator the allocator through which component bytes are copied
 * @return the newly built name
 */
@Override
public CellName copy(CFMetaData cfm, AbstractAllocator allocator)
{
    switch (nametype())
    {
        case SIMPLE_DENSE:
            return CellNames.simpleDense(get(0, allocator));
        case SIMPLE_SPARSE:
            return CellNames.simpleSparse(getIdentifier(cfm, get(0)));
        case COMPOUND_DENSE:
        {
            final ByteBuffer[] components = new ByteBuffer[size()];
            for (int i = 0; i < components.length; i++)
                components[i] = get(i, allocator);
            return CellNames.compositeDense(components);
        }
        case COMPOUND_SPARSE:
        case COMPOUND_SPARSE_STATIC:
        {
            final int prefixLength = clusteringSize();
            final ByteBuffer[] prefix = (prefixLength == 0) ? EMPTY : new ByteBuffer[prefixLength];
            for (int i = 0; i < prefixLength; i++)
                prefix[i] = get(i, allocator);

            // the column name is the component right after the clustering prefix;
            // an empty one denotes the row marker
            final ByteBuffer columnName = get(prefix.length);
            final ColumnIdentifier name = (columnName.remaining() == 0)
                                          ? CompoundSparseCellNameType.rowMarkerId
                                          : getIdentifier(cfm, columnName);

            final boolean hasCollectionElement = clusteringSizeDelta() == 2;
            if (!hasCollectionElement)
                return CellNames.compositeSparse(prefix, name, isStatic());

            final ByteBuffer element = allocator.clone(get(size() - 1));
            return CellNames.compositeSparseWithCollection(prefix, element, name, isStatic());
        }
    }
    throw new IllegalStateException();
}
/**
 * Resolves the ColumnIdentifier for a column name.
 *
 * @param cfMetaData the table metadata to look the name up in
 * @param name       the raw column name bytes
 * @return the identifier of the matching column definition when one exists; otherwise a new
 *         identifier built from a heap clone of {@code name}
 */
private static ColumnIdentifier getIdentifier(CFMetaData cfMetaData, ByteBuffer name)
{
    final ColumnDefinition def = cfMetaData.getColumnDefinition(name);
    if (def != null)
        return def.name;

    // it's safe to simply grab based on clusteringPrefixSize() as we are only called if not a dense type
    final int columnNamePosition = cfMetaData.comparator.clusteringPrefixSize();
    final AbstractType<?> type = cfMetaData.comparator.subtype(columnNamePosition);
    return new ColumnIdentifier(HeapAllocator.instance.clone(name), type);
}
// Not supported by this implementation: always throws UnsupportedOperationException.
@Override
public Cell withUpdatedName(CellName newName)
{
throw new UnsupportedOperationException();
}
// Not supported by this implementation: always throws UnsupportedOperationException.
@Override
public Cell withUpdatedTimestamp(long newTimestamp)
{
throw new UnsupportedOperationException();
}
// Total size of the allocation in bytes, read back from the int the constructor stored at its start.
protected long internalSize()
{
return MemoryUtil.getInt(peer);
}
// Assertion-only bounds check used by every accessor below: the range [offset, offset + size)
// must lie within the allocation and peer must be positive. It consists solely of assert
// statements, so it checks nothing when assertions are disabled.
private void checkPosition(long offset, long size)
{
assert size >= 0;
assert peer > 0 : "Memory was freed";
assert offset >= 0 && offset + size <= internalSize() : String.format("Illegal range: [%d..%d), size: %s", offset, offset + size, internalSize());
}
// Writes one byte at the given offset from the start of the allocation, after the bounds assertion.
protected final void setByte(long offset, byte b)
{
checkPosition(offset, 1);
MemoryUtil.setByte(peer + offset, b);
}
/**
 * Writes a 2-byte short at {@code offset} from the start of the allocation.
 *
 * @param offset position of the first byte, relative to the start of the allocation
 * @param s      the value to write
 */
protected final void setShort(long offset, short s)
{
    // a short occupies two bytes, so both must lie inside the allocation (as getShort already checks)
    checkPosition(offset, 2);
    MemoryUtil.setShort(peer + offset, s);
}
// Writes a 4-byte int at the given offset from the start of the allocation, after the bounds assertion.
protected final void setInt(long offset, int l)
{
checkPosition(offset, 4);
MemoryUtil.setInt(peer + offset, l);
}
// Writes an 8-byte long at the given offset from the start of the allocation, after the bounds assertion.
protected final void setLong(long offset, long l)
{
checkPosition(offset, 8);
MemoryUtil.setLong(peer + offset, l);
}
/**
 * Writes the remaining bytes of {@code buffer} at {@code offset} from the start of the allocation.
 * An empty buffer is a no-op and skips the bounds assertion entirely.
 *
 * @param offset position of the first byte, relative to the start of the allocation
 * @param buffer the bytes to write (from its position up to its limit)
 */
protected final void setBytes(long offset, ByteBuffer buffer)
{
    final int count = buffer.remaining();
    if (count != 0)
    {
        checkPosition(offset, count);
        MemoryUtil.setBytes(peer + offset, buffer);
    }
}
// Reads one byte at the given offset from the start of the allocation, after the bounds assertion.
protected final byte getByte(long offset)
{
checkPosition(offset, 1);
return MemoryUtil.getByte(peer + offset);
}
// Copies count bytes, starting at the given offset from the start of the allocation, into
// trg beginning at trgOffset, after the bounds assertion on the source range.
protected final void getBytes(long offset, byte[] trg, int trgOffset, int count)
{
checkPosition(offset, count);
MemoryUtil.getBytes(peer + offset, trg, trgOffset, count);
}
// Reads the 2-byte short at the given offset and returns it as an int, after the bounds assertion.
// NOTE(review): whether the result is sign- or zero-extended is decided by MemoryUtil.getShort - confirm there.
protected final int getShort(long offset)
{
checkPosition(offset, 2);
return MemoryUtil.getShort(peer + offset);
}
// Reads the 4-byte int at the given offset from the start of the allocation, after the bounds assertion.
protected final int getInt(long offset)
{
checkPosition(offset, 4);
return MemoryUtil.getInt(peer + offset);
}
// Reads the 8-byte long at the given offset from the start of the allocation, after the bounds assertion.
protected final long getLong(long offset)
{
checkPosition(offset, 8);
return MemoryUtil.getLong(peer + offset);
}
// Returns the ByteBuffer that MemoryUtil produces for length bytes starting at the given offset
// from the start of the allocation, after the bounds assertion.
protected final ByteBuffer getByteBuffer(long offset, int length)
{
checkPosition(offset, length);
return MemoryUtil.getByteBuffer(peer + offset, length);
}
// Requires isByteOrderComparable to be true. Compares the name components only; may still need to compare EOC etc.
// The components of this name are compared directly from its memory against that.get(i), walking
// the delta table once rather than building a buffer per component.
@Inline
public final int compareTo(final Composite that)
{
if (isStatic() != that.isStatic())
{
// Static sorts before non-static no matter what, except for empty which
// always sort first
if (isEmpty())
return that.isEmpty() ? 0 : -1;
if (that.isEmpty())
return 1;
return isStatic() ? -1 : 1;
}
int size = size();
int size2 = that.size();
int minSize = Math.min(size, size2);
// startDelta/endDelta bound the current component, relative to where the name bytes begin
int startDelta = 0;
int cellNamesOffset = nameDeltaOffset(size);
for (int i = 0 ; i < minSize ; i++)
{
// the end of the last component is not stored: it is wherever the value begins
int endDelta = i < size - 1 ? getShort(nameDeltaOffset(i + 1)) : valueStartOffset() - cellNamesOffset;
long offset = peer + cellNamesOffset + startDelta;
int length = endDelta - startDelta;
int cmp = FastByteOperations.UnsafeOperations.compareTo(null, offset, length, that.get(i));
if (cmp != 0)
return cmp;
// the next component starts where this one ended
startDelta = endDelta;
}
// all shared components are equal: order by EOC, or by which side is the shorter prefix
EOC eoc = that.eoc();
if (size == size2)
return this.eoc().compareTo(eoc);
return size < size2 ? this.eoc().prefixComparisonResult : -eoc.prefixComparisonResult;
}
/**
 * Compares two single-component names by comparing this name's bytes, read directly from its memory,
 * against the other name's only component.
 *
 * @param that the name to compare against; like this one, it must have exactly one component
 * @return the result of the byte comparison
 */
public final int compareToSimple(final Composite that)
{
    assert size() == 1 && that.size() == 1;
    // with a single component there are no delta entries, so the name starts right after the header
    final int nameStart = nameDeltaOffset(1);
    final int length = valueStartOffset() - nameStart;
    return FastByteOperations.UnsafeOperations.compareTo(null, peer + nameStart, length, that.get(0));
}
}