| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.cassandra.db; |
| |
| import java.io.DataInput; |
| import java.io.IOException; |
| import java.io.IOError; |
| import java.nio.ByteBuffer; |
| import java.security.MessageDigest; |
| import java.util.*; |
| import java.util.concurrent.TimeUnit; |
| import java.util.stream.Collectors; |
| |
| import org.apache.cassandra.cql3.ColumnIdentifier; |
| import org.apache.cassandra.cql3.SuperColumnCompatibility; |
| import org.apache.cassandra.utils.AbstractIterator; |
| |
| import com.google.common.annotations.VisibleForTesting; |
| import com.google.common.collect.Iterators; |
| import com.google.common.collect.Lists; |
| import com.google.common.collect.PeekingIterator; |
| |
| import org.apache.cassandra.config.CFMetaData; |
| import org.apache.cassandra.config.ColumnDefinition; |
| import org.apache.cassandra.db.filter.ColumnFilter; |
| import org.apache.cassandra.db.filter.DataLimits; |
| import org.apache.cassandra.db.rows.*; |
| import org.apache.cassandra.db.partitions.*; |
| import org.apache.cassandra.db.context.CounterContext; |
| import org.apache.cassandra.db.marshal.*; |
| import org.apache.cassandra.io.util.DataInputPlus; |
| import org.apache.cassandra.io.util.DataOutputPlus; |
| import org.apache.cassandra.net.MessagingService; |
| import org.apache.cassandra.utils.*; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| import static com.google.common.collect.Iterables.all; |
| import static org.apache.cassandra.utils.ByteBufferUtil.bytes; |
| |
| /** |
 * Functions to deal with the old (pre-3.0) format.
| */ |
| public abstract class LegacyLayout |
| { |
| private static final Logger logger = LoggerFactory.getLogger(LegacyLayout.class); |
| private static final NoSpamLogger noSpamLogger = NoSpamLogger.getLogger(logger, 1L, TimeUnit.MINUTES); |
| |
| public final static int MAX_CELL_NAME_LENGTH = FBUtilities.MAX_UNSIGNED_SHORT; |
| |
| public final static int STATIC_PREFIX = 0xFFFF; |
| |
| public final static int DELETION_MASK = 0x01; |
| public final static int EXPIRATION_MASK = 0x02; |
| public final static int COUNTER_MASK = 0x04; |
| public final static int COUNTER_UPDATE_MASK = 0x08; |
| private final static int RANGE_TOMBSTONE_MASK = 0x10; |
| |
// Used in decodeBound if the number of components in the legacy bound is greater than the clustering size,
// indicating a complex column deletion (i.e. a collection tombstone), but the referenced column is either
// not present in the current table metadata, or is not currently a complex column. In that case, we'll
// check the table's dropped columns, which should contain the previous column definition. If that
// previous definition is also not complex (indicating that the column may have been dropped and re-added
// with different types multiple times), we use this fake definition to ensure that the complex deletion
// can be safely processed. The resulting deletion should be filtered out of any row created by a
// CellGrouper by the dropped column check, but this gives us an extra level of confidence, as that check
// is timestamp based and so is fallible in the face of clock drift.
| private static final ColumnDefinition INVALID_DROPPED_COMPLEX_SUBSTITUTE_COLUMN = |
| new ColumnDefinition("", |
| "", |
| ColumnIdentifier.getInterned(ByteBufferUtil.EMPTY_BYTE_BUFFER, UTF8Type.instance), |
| SetType.getInstance(UTF8Type.instance, true), |
| ColumnDefinition.NO_POSITION, |
| ColumnDefinition.Kind.REGULAR); |
| |
| private LegacyLayout() {} |
| |
| public static AbstractType<?> makeLegacyComparator(CFMetaData metadata) |
| { |
| ClusteringComparator comparator = metadata.comparator; |
| if (!metadata.isCompound()) |
| { |
| assert comparator.size() == 1; |
| return comparator.subtype(0); |
| } |
| |
| boolean hasCollections = metadata.hasCollectionColumns() || metadata.hasDroppedCollectionColumns(); |
| List<AbstractType<?>> types = new ArrayList<>(comparator.size() + (metadata.isDense() ? 0 : 1) + (hasCollections ? 1 : 0)); |
| |
| types.addAll(comparator.subtypes()); |
| |
| if (!metadata.isDense()) |
| { |
| types.add(UTF8Type.instance); |
| |
| if (hasCollections) |
| { |
| Map<ByteBuffer, CollectionType> defined = new HashMap<>(); |
| |
| for (CFMetaData.DroppedColumn def : metadata.getDroppedColumns().values()) |
| if (def.type instanceof CollectionType && def.type.isMultiCell()) |
| defined.put(bytes(def.name), (CollectionType) def.type); |
| |
| for (ColumnDefinition def : metadata.partitionColumns()) |
| if (def.type instanceof CollectionType && def.type.isMultiCell()) |
| defined.put(def.name.bytes, (CollectionType) def.type); |
| |
| types.add(ColumnToCollectionType.getInstance(defined)); |
| } |
| } |
| return CompositeType.getInstance(types); |
| } |
| |
| public static LegacyCellName decodeCellName(CFMetaData metadata, ByteBuffer superColumnName, ByteBuffer cellname) |
| throws UnknownColumnException |
| { |
| assert cellname != null; |
| if (metadata.isSuper()) |
| { |
| assert superColumnName != null; |
| return decodeForSuperColumn(metadata, new Clustering(superColumnName), cellname); |
| } |
| |
| assert superColumnName == null; |
| return decodeCellName(metadata, cellname); |
| } |
| |
| private static LegacyCellName decodeForSuperColumn(CFMetaData metadata, Clustering clustering, ByteBuffer subcol) |
| { |
| ColumnDefinition def = metadata.getColumnDefinition(subcol); |
| if (def != null) |
| { |
| // it's a statically defined subcolumn |
| return new LegacyCellName(clustering, def, null); |
| } |
| |
| def = metadata.compactValueColumn(); |
| assert def != null && def.type instanceof MapType; |
| return new LegacyCellName(clustering, def, subcol); |
| } |
| |
| public static LegacyCellName decodeCellName(CFMetaData metadata, ByteBuffer cellname) throws UnknownColumnException |
| { |
| return decodeCellName(metadata, cellname, false); |
| } |
| |
| public static LegacyCellName decodeCellName(CFMetaData metadata, ByteBuffer cellname, boolean readAllAsDynamic) throws UnknownColumnException |
| { |
| Clustering clustering = decodeClustering(metadata, cellname); |
| |
| if (metadata.isSuper()) |
| return decodeForSuperColumn(metadata, clustering, CompositeType.extractComponent(cellname, 1)); |
| |
| if (metadata.isDense() || (metadata.isCompactTable() && readAllAsDynamic)) |
| return new LegacyCellName(clustering, metadata.compactValueColumn(), null); |
| |
| ByteBuffer column = metadata.isCompound() ? CompositeType.extractComponent(cellname, metadata.comparator.size()) : cellname; |
| if (column == null) |
| { |
// Tables for composite secondary indexes used to be compound but dense; we've since transformed them into regular
// (non-compact) tables with no regular columns (i.e. we only care about the clustering). So we'll get here
// in that case, and what we want to return is basically a row marker.
| if (metadata.partitionColumns().isEmpty()) |
| return new LegacyCellName(clustering, null, null); |
| |
| // Otherwise, we shouldn't get there |
| throw new IllegalArgumentException("No column name component found in cell name"); |
| } |
| |
| // Row marker, this is ok |
| if (!column.hasRemaining()) |
| return new LegacyCellName(clustering, null, null); |
| |
| ColumnDefinition def = metadata.getColumnDefinition(column); |
| |
| if (metadata.isCompactTable()) |
| { |
| if (def == null || def.isPrimaryKeyColumn()) |
| // If it's a compact table, it means the column is in fact a "dynamic" one |
| return new LegacyCellName(new Clustering(column), metadata.compactValueColumn(), null); |
| } |
| else if (def == null) |
| { |
| throw new UnknownColumnException(metadata, column); |
| } |
| |
| ByteBuffer collectionElement = metadata.isCompound() ? CompositeType.extractComponent(cellname, metadata.comparator.size() + 1) : null; |
| if (collectionElement != null && def.type instanceof CollectionType) |
| { |
| ((CollectionType)def.type).nameComparator().validateIfFixedSize(collectionElement); |
| } |
| |
// Note that because static compact columns are translated to static defs in the new world order, we need to force a static
// clustering if the definition is static (as the decoded clustering might not be static in this case).
| return new LegacyCellName(def.isStatic() ? Clustering.STATIC_CLUSTERING : clustering, def, collectionElement); |
| } |
| |
| public static LegacyBound decodeSliceBound(CFMetaData metadata, ByteBuffer bound, boolean isStart) |
| { |
| return decodeBound(metadata, bound, isStart, false); |
| } |
| |
| public static LegacyBound decodeTombstoneBound(CFMetaData metadata, ByteBuffer bound, boolean isStart) |
| { |
| return decodeBound(metadata, bound, isStart, true); |
| } |
| |
| private static LegacyBound decodeBound(CFMetaData metadata, ByteBuffer bound, boolean isStart, boolean isDeletion) |
| { |
| if (!bound.hasRemaining()) |
| return isStart ? LegacyBound.BOTTOM : LegacyBound.TOP; |
| |
| if (!metadata.isCompound()) |
| { |
// The non-compound case is a lot easier, in that there is no EOC nor collection to worry about, so we deal
// with that first.
| metadata.comparator.subtype(0).validateIfFixedSize(bound); |
| return new LegacyBound(isStart ? Slice.Bound.inclusiveStartOf(bound) : Slice.Bound.inclusiveEndOf(bound), false, null); |
| } |
| |
| int clusteringSize = metadata.comparator.size(); |
| |
| boolean isStatic = metadata.isCompound() && CompositeType.isStaticName(bound); |
| List<ByteBuffer> components = CompositeType.splitName(bound); |
| byte eoc = CompositeType.lastEOC(bound); |
| for (int i=0; i<Math.min(clusteringSize, components.size()); i++) |
| { |
| metadata.comparator.subtype(i).validateIfFixedSize(components.get(i)); |
| } |
| |
| // if the bound we have decoded is static, 2.2 format requires there to be N empty clusterings |
| assert !isStatic || |
| (components.size() >= clusteringSize |
| && all(components.subList(0, clusteringSize), ByteBufferUtil.EMPTY_BYTE_BUFFER::equals)); |
| |
| ColumnDefinition collectionName = null; |
| if (components.size() > clusteringSize) |
| { |
// For a deletion, there can be more components than the clustering size only in the case this is the
// bound of a collection range tombstone. In such a case, there is exactly one more component, and that
// component is the name of the collection being deleted, since we do not support collection range deletions.
// If the bound is not part of a deletion, it comes from a slice query filter. The column name may be:
// - a valid, non-collection column; in this case we expect a single extra component
// - an empty buffer, representing a row marker; in this case we also expect a single extra empty component
// - a valid collection column plus the first part of a cell path; in this case we expect exactly two extra components
// In any of these slice cases, the extra items are unnecessary for the bound we construct,
// so we can simply remove them, after corroborating that we have encountered one of these scenarios (illustrated below).
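// Illustrative component layouts for the cases above (clusteringSize = n):
//   collection tombstone bound: <ck_1>...<ck_n><collection column name>
//   simple column slice bound:  <ck_1>...<ck_n><column name>
//   row marker slice bound:     <ck_1>...<ck_n><empty buffer>
//   complex cell slice bound:   <ck_1>...<ck_n><collection column name><collection key>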
| assert !metadata.isCompactTable() : toDebugHex(components); |
| |
| // In all cases, the element straight after the clusterings should contain the name of a column. |
| if (components.size() > clusteringSize + 1) |
| { |
| // we accept bounds from paging state that occur inside a complex column - in this case, we expect |
| // two excess components, the first of which is a column name, the second a key into the collection |
| if (isDeletion) |
| throw new IllegalArgumentException("Invalid bound " + toDebugHex(components) + ": deletion can have at most one extra component"); |
| |
| if (clusteringSize + 2 != components.size()) |
| throw new IllegalArgumentException("Invalid bound " + toDebugHex(components) + ": complex slices require exactly two extra components"); |
| |
| // decode simply to verify that we have (or may have had) a complex column; we assume the collection key is valid |
| decodeBoundLookupComplexColumn(metadata, components, clusteringSize, isStatic); |
| components.remove(clusteringSize + 1); |
| } |
| else if (isDeletion) |
| { |
| collectionName = decodeBoundLookupComplexColumn(metadata, components, clusteringSize, isStatic); |
| } |
| else if (components.get(clusteringSize).hasRemaining()) |
| { |
| decodeBoundVerifySimpleColumn(metadata, components, clusteringSize, isStatic); |
| } |
| components.remove(clusteringSize); |
| } |
| |
| boolean isInclusive; |
| if (isStart) |
| { |
| isInclusive = eoc <= 0; |
| } |
| else |
| { |
| isInclusive = eoc >= 0; |
| |
// for an end bound, if we only have a prefix of all the components and the final EOC is zero,
// then it should only match up to the prefix but no further; that is, it is an inclusive bound
// of the exact prefix but an exclusive bound of anything beyond it. Adding an empty
// composite value ensures this behavior; see CASSANDRA-12423 for more details
| if (eoc == 0 && components.size() < clusteringSize) |
| { |
| components.add(ByteBufferUtil.EMPTY_BYTE_BUFFER); |
| isInclusive = false; |
| } |
| } |
| |
| Slice.Bound.Kind boundKind = Slice.Bound.boundKind(isStart, isInclusive); |
| Slice.Bound sb = Slice.Bound.create(boundKind, components.toArray(new ByteBuffer[components.size()])); |
| return new LegacyBound(sb, isStatic, collectionName); |
| } |
| |
// finds the complex column definition associated with components.get(clusteringSize)
// if no such column exists, or ever existed, we throw an exception; if we do not know, we return a dummy column definition
| private static ColumnDefinition decodeBoundLookupComplexColumn(CFMetaData metadata, List<ByteBuffer> components, int clusteringSize, boolean isStatic) |
| { |
| ByteBuffer columnNameBytes = components.get(clusteringSize); |
| ColumnDefinition columnName = metadata.getColumnDefinition(columnNameBytes); |
| if (columnName == null || !columnName.isComplex()) |
| { |
| columnName = metadata.getDroppedColumnDefinition(columnNameBytes, isStatic); |
| // if no record of the column having ever existed is found, something is badly wrong |
| if (columnName == null) |
| throw new IllegalArgumentException("Invalid bound " + toDebugHex(components) + ": expected complex column at position " + clusteringSize); |
| |
| // if we do have a record of dropping this column but it wasn't previously complex, use a fake |
| // column definition for safety (see the comment on the constant declaration for details) |
| if (!columnName.isComplex()) |
| columnName = INVALID_DROPPED_COMPLEX_SUBSTITUTE_COLUMN; |
| } |
| |
| return columnName; |
| } |
| |
// finds the simple column definition associated with components.get(clusteringSize)
// if no such column exists, and definitely never existed, we throw an exception
| private static void decodeBoundVerifySimpleColumn(CFMetaData metadata, List<ByteBuffer> components, int clusteringSize, boolean isStatic) |
| { |
| ByteBuffer columnNameBytes = components.get(clusteringSize); |
| ColumnDefinition columnName = metadata.getColumnDefinition(columnNameBytes); |
| if (columnName == null || !columnName.isSimple()) |
| { |
| columnName = metadata.getDroppedColumnDefinition(columnNameBytes, isStatic); |
| // if no record of the column having ever existed is found, something is badly wrong |
| if (columnName == null) |
| throw new IllegalArgumentException("Invalid bound " + toDebugHex(components) + ": expected simple column at position " + clusteringSize); |
| } |
| } |
| |
| private static String toDebugHex(Collection<ByteBuffer> buffers) |
| { |
| return buffers.stream().map(ByteBufferUtil::bytesToHex).collect(Collectors.joining()); |
| } |
| |
| public static ByteBuffer encodeBound(CFMetaData metadata, Slice.Bound bound, boolean isStart) |
| { |
| if (bound == Slice.Bound.BOTTOM || bound == Slice.Bound.TOP || metadata.comparator.size() == 0) |
| return ByteBufferUtil.EMPTY_BYTE_BUFFER; |
| |
| ClusteringPrefix clustering = bound.clustering(); |
| |
| if (!metadata.isCompound()) |
| { |
| assert clustering.size() == 1; |
| return clustering.get(0); |
| } |
| |
| CompositeType ctype = CompositeType.getInstance(metadata.comparator.subtypes()); |
| CompositeType.Builder builder = ctype.builder(); |
| for (int i = 0; i < clustering.size(); i++) |
| builder.add(clustering.get(i)); |
| |
| if (isStart) |
| return bound.isInclusive() ? builder.build() : builder.buildAsEndOfRange(); |
| else |
| return bound.isInclusive() ? builder.buildAsEndOfRange() : builder.build(); |
| } |
| |
| public static ByteBuffer encodeCellName(CFMetaData metadata, ClusteringPrefix clustering, ByteBuffer columnName, ByteBuffer collectionElement) |
| { |
| boolean isStatic = clustering == Clustering.STATIC_CLUSTERING; |
| |
| if (!metadata.isCompound()) |
| { |
| if (isStatic) |
| return columnName; |
| |
| assert clustering.size() == 1 : "Expected clustering size to be 1, but was " + clustering.size(); |
| return clustering.get(0); |
| } |
| |
| // We use comparator.size() rather than clustering.size() because of static clusterings |
| int clusteringSize = metadata.comparator.size(); |
| int size = clusteringSize + (metadata.isDense() ? 0 : 1) + (collectionElement == null ? 0 : 1); |
| if (metadata.isSuper()) |
| size = clusteringSize + 1; |
| ByteBuffer[] values = new ByteBuffer[size]; |
| for (int i = 0; i < clusteringSize; i++) |
| { |
| if (isStatic) |
| { |
| values[i] = ByteBufferUtil.EMPTY_BYTE_BUFFER; |
| continue; |
| } |
| |
| ByteBuffer v = clustering.get(i); |
| // we can have null (only for dense compound tables for backward compatibility reasons) but that |
| // means we're done and should stop there as far as building the composite is concerned. |
| if (v == null) |
| return CompositeType.build(Arrays.copyOfRange(values, 0, i)); |
| |
| values[i] = v; |
| } |
| |
| if (metadata.isSuper()) |
| { |
| // We need to set the "column" (in thrift terms) name, i.e. the value corresponding to the subcomparator. |
// What it is depends on whether this is a cell for a declared "static" column or a "dynamic" column that is part of the
// super-column internal map.
| assert columnName != null; // This should never be null for supercolumns, see decodeForSuperColumn() above |
| values[clusteringSize] = columnName.equals(SuperColumnCompatibility.SUPER_COLUMN_MAP_COLUMN) |
| ? collectionElement |
| : columnName; |
| } |
| else |
| { |
| if (!metadata.isDense()) |
| values[clusteringSize] = columnName; |
| if (collectionElement != null) |
| values[clusteringSize + 1] = collectionElement; |
| } |
| |
| return CompositeType.build(isStatic, values); |
| } |
| |
| public static Clustering decodeClustering(CFMetaData metadata, ByteBuffer value) |
| { |
| int csize = metadata.comparator.size(); |
| if (csize == 0) |
| return Clustering.EMPTY; |
| |
| if (metadata.isCompound() && CompositeType.isStaticName(value)) |
| return Clustering.STATIC_CLUSTERING; |
| |
| List<ByteBuffer> components = metadata.isCompound() |
| ? CompositeType.splitName(value) |
| : Collections.singletonList(value); |
| |
| for (int i=0; i<Math.min(csize, components.size()); i++) |
| { |
| AbstractType<?> type = metadata.comparator.subtype(i); |
| type.validateIfFixedSize(components.get(i)); |
| } |
| return new Clustering(components.subList(0, Math.min(csize, components.size())).toArray(new ByteBuffer[csize])); |
| } |
| |
| public static ByteBuffer encodeClustering(CFMetaData metadata, ClusteringPrefix clustering) |
| { |
| if (clustering.size() == 0) |
| return ByteBufferUtil.EMPTY_BYTE_BUFFER; |
| |
| if (!metadata.isCompound()) |
| { |
| assert clustering.size() == 1; |
| return clustering.get(0); |
| } |
| |
| ByteBuffer[] values = new ByteBuffer[clustering.size()]; |
| for (int i = 0; i < clustering.size(); i++) |
| values[i] = clustering.get(i); |
| return CompositeType.build(values); |
| } |
| |
| /** |
| * The maximum number of cells to include per partition when converting to the old format. |
| * <p> |
 * We already apply the limit during the actual query, but for queries that count cells and not rows (thrift queries
 * and distinct queries as far as old nodes are concerned), we may still include a little bit more than requested
 * because {@link DataLimits} always includes full rows. So if the limit ends in the middle of a queried row, the
 * full row will be part of our result. This would confuse old nodes, however, so we make sure to truncate it to
 * what's expected before writing it on the wire.
| * |
 * @param command the read command for which to determine the maximum cells per partition. This can be {@code null}
 * in which case {@code Integer.MAX_VALUE} is returned.
 * @return the maximum number of cells per partition that should be enforced according to the read command if
 * post-query limitations are in order (see above). This will be {@code Integer.MAX_VALUE} if no such limits are
| * necessary. |
| */ |
| private static int maxLiveCellsPerPartition(ReadCommand command) |
| { |
| if (command == null) |
| return Integer.MAX_VALUE; |
| |
| DataLimits limits = command.limits(); |
| |
// There are two types of DISTINCT queries: those that include only the partition key, and those that also include static columns.
// On old nodes, the latter expects the first row in terms of CQL count, which is what we already have, so there is no additional
// limit to apply. The former, however, expects only one cell per partition and relies on it (see CASSANDRA-10762).
| if (limits.isDistinct()) |
| return command.columnFilter().fetchedColumns().statics.isEmpty() ? 1 : Integer.MAX_VALUE; |
| |
| switch (limits.kind()) |
| { |
| case THRIFT_LIMIT: |
| case SUPER_COLUMN_COUNTING_LIMIT: |
| return limits.perPartitionCount(); |
| default: |
| return Integer.MAX_VALUE; |
| } |
| } |
| |
| // For serializing to old wire format |
| public static LegacyUnfilteredPartition fromUnfilteredRowIterator(ReadCommand command, UnfilteredRowIterator iterator) |
| { |
// we need to extract the range tombstones, so we materialize the partition. Since this is
// used for the on-wire format, this is no worse than it used to be.
| final ImmutableBTreePartition partition = ImmutableBTreePartition.create(iterator); |
| DeletionInfo info = partition.deletionInfo(); |
| Pair<LegacyRangeTombstoneList, Iterator<LegacyCell>> pair = fromRowIterator(partition.metadata(), partition.iterator(), partition.staticRow()); |
| |
| LegacyLayout.LegacyRangeTombstoneList rtl = pair.left; |
| |
| // Processing the cell iterator results in the LegacyRangeTombstoneList being populated, so we do this |
| // before we use the LegacyRangeTombstoneList at all |
| List<LegacyLayout.LegacyCell> cells = Lists.newArrayList(pair.right); |
| |
| int maxCellsPerPartition = maxLiveCellsPerPartition(command); |
| cells = maybeTrimLiveCells(cells, maxCellsPerPartition, command); |
| |
| // The LegacyRangeTombstoneList already has range tombstones for the single-row deletions and complex |
// deletions. Go through our normal range tombstones and add them to the LegacyRTL so that the range
| // tombstones all get merged and sorted properly. |
| if (info.hasRanges()) |
| { |
| Iterator<RangeTombstone> rangeTombstoneIterator = info.rangeIterator(false); |
| while (rangeTombstoneIterator.hasNext()) |
| { |
| RangeTombstone rt = rangeTombstoneIterator.next(); |
| Slice slice = rt.deletedSlice(); |
| LegacyLayout.LegacyBound start = new LegacyLayout.LegacyBound(slice.start(), false, null); |
| LegacyLayout.LegacyBound end = new LegacyLayout.LegacyBound(slice.end(), false, null); |
| rtl.add(start, end, rt.deletionTime().markedForDeleteAt(), rt.deletionTime().localDeletionTime()); |
| } |
| } |
| |
| return new LegacyUnfilteredPartition(info.getPartitionDeletion(), rtl, cells); |
| } |
| |
| private static List<LegacyCell> maybeTrimLiveCells(List<LegacyCell> cells, int maxLiveCells, ReadCommand command) |
| { |
| if (null == command || maxLiveCells >= cells.size()) |
| return cells; |
| |
| int nowInSec = command.nowInSec(); |
| int live = 0; |
| int dead = 0; |
| |
| for (int i = 0; i < cells.size() && live < maxLiveCells; i++) |
| { |
| if (cells.get(i).isLive(nowInSec)) |
| live++; |
| else |
| dead++; |
| } |
| |
| return cells.subList(0, live + dead); |
| } |
| |
| public static void serializeAsLegacyPartition(ReadCommand command, UnfilteredRowIterator partition, DataOutputPlus out, int version) throws IOException |
| { |
| assert version < MessagingService.VERSION_30; |
| |
| out.writeBoolean(true); |
| |
| LegacyLayout.LegacyUnfilteredPartition legacyPartition = LegacyLayout.fromUnfilteredRowIterator(command, partition); |
| |
| UUIDSerializer.serializer.serialize(partition.metadata().cfId, out, version); |
| DeletionTime.serializer.serialize(legacyPartition.partitionDeletion, out); |
| |
| legacyPartition.rangeTombstones.serialize(out, partition.metadata()); |
| |
| // begin cell serialization |
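// Rough per-cell layout written below (pre-3.0 wire format):
//   <cell name (short length prefixed)><flags byte>
// followed, depending on the flags, by:
//   expiring:       <ttl int><localDeletionTime int><timestamp long><value (length prefixed)>
//   tombstone:      <timestamp long><length int><localDeletionTime int>
//   counter update: <timestamp long><update count as a length-prefixed long>
//   counter:        <timestampOfLastDelete long><timestamp long><value (length prefixed)>
//   regular:        <timestamp long><value (length prefixed)>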
| out.writeInt(legacyPartition.cells.size()); |
| for (LegacyLayout.LegacyCell cell : legacyPartition.cells) |
| { |
| ByteBufferUtil.writeWithShortLength(cell.name.encode(partition.metadata()), out); |
| out.writeByte(cell.serializationFlags()); |
| if (cell.isExpiring()) |
| { |
| out.writeInt(cell.ttl); |
| out.writeInt(cell.localDeletionTime); |
| } |
| else if (cell.isTombstone()) |
| { |
| out.writeLong(cell.timestamp); |
| out.writeInt(TypeSizes.sizeof(cell.localDeletionTime)); |
| out.writeInt(cell.localDeletionTime); |
| continue; |
| } |
| else if (cell.isCounterUpdate()) |
| { |
| out.writeLong(cell.timestamp); |
| long count = CounterContext.instance().getUpdateCount(cell.value); |
| ByteBufferUtil.writeWithLength(ByteBufferUtil.bytes(count), out); |
| continue; |
| } |
| else if (cell.isCounter()) |
| { |
| out.writeLong(Long.MIN_VALUE); // timestampOfLastDelete (not used, and MIN_VALUE is the default) |
| } |
| |
| out.writeLong(cell.timestamp); |
| ByteBufferUtil.writeWithLength(cell.value, out); |
| } |
| } |
| |
| // For the old wire format |
| // Note: this can return null if an empty partition is serialized! |
| public static UnfilteredRowIterator deserializeLegacyPartition(DataInputPlus in, int version, SerializationHelper.Flag flag, ByteBuffer key) throws IOException |
| { |
| assert version < MessagingService.VERSION_30; |
| |
// This is only used for mutations, and mutations have never allowed "null" column families
| boolean present = in.readBoolean(); |
| if (!present) |
| return null; |
| |
| CFMetaData metadata = CFMetaData.serializer.deserialize(in, version); |
| LegacyDeletionInfo info = LegacyDeletionInfo.deserialize(metadata, in); |
| int size = in.readInt(); |
| Iterator<LegacyCell> cells = deserializeCells(metadata, in, flag, size); |
| SerializationHelper helper = new SerializationHelper(metadata, version, flag); |
| return onWireCellstoUnfilteredRowIterator(metadata, metadata.partitioner.decorateKey(key), info, cells, false, helper); |
| } |
| |
| // For the old wire format |
| public static long serializedSizeAsLegacyPartition(ReadCommand command, UnfilteredRowIterator partition, int version) |
| { |
| assert version < MessagingService.VERSION_30; |
| |
| if (partition.isEmpty()) |
| return TypeSizes.sizeof(false); |
| |
| long size = TypeSizes.sizeof(true); |
| |
| LegacyLayout.LegacyUnfilteredPartition legacyPartition = LegacyLayout.fromUnfilteredRowIterator(command, partition); |
| |
| size += UUIDSerializer.serializer.serializedSize(partition.metadata().cfId, version); |
| size += DeletionTime.serializer.serializedSize(legacyPartition.partitionDeletion); |
| size += legacyPartition.rangeTombstones.serializedSize(partition.metadata()); |
| |
| // begin cell serialization |
| size += TypeSizes.sizeof(legacyPartition.cells.size()); |
| for (LegacyLayout.LegacyCell cell : legacyPartition.cells) |
| { |
| size += ByteBufferUtil.serializedSizeWithShortLength(cell.name.encode(partition.metadata())); |
| size += 1; // serialization flags |
| if (cell.isExpiring()) |
| { |
| size += TypeSizes.sizeof(cell.ttl); |
| size += TypeSizes.sizeof(cell.localDeletionTime); |
| } |
| else if (cell.isTombstone()) |
| { |
| size += TypeSizes.sizeof(cell.timestamp); |
| // localDeletionTime replaces cell.value as the body |
| size += TypeSizes.sizeof(TypeSizes.sizeof(cell.localDeletionTime)); |
| size += TypeSizes.sizeof(cell.localDeletionTime); |
| continue; |
| } |
| else if (cell.isCounterUpdate()) |
| { |
| size += TypeSizes.sizeof(cell.timestamp); |
| long count = CounterContext.instance().getUpdateCount(cell.value); |
| size += ByteBufferUtil.serializedSizeWithLength(ByteBufferUtil.bytes(count)); |
| continue; |
| } |
| else if (cell.isCounter()) |
| { |
| size += TypeSizes.sizeof(Long.MIN_VALUE); // timestampOfLastDelete |
| } |
| |
| size += TypeSizes.sizeof(cell.timestamp); |
| size += ByteBufferUtil.serializedSizeWithLength(cell.value); |
| } |
| |
| return size; |
| } |
| |
// For Thrift's sake
| public static UnfilteredRowIterator toUnfilteredRowIterator(CFMetaData metadata, |
| DecoratedKey key, |
| LegacyDeletionInfo delInfo, |
| Iterator<LegacyCell> cells) |
| { |
| SerializationHelper helper = new SerializationHelper(metadata, 0, SerializationHelper.Flag.LOCAL); |
| return toUnfilteredRowIterator(metadata, key, delInfo, cells, false, helper); |
| } |
| |
| // For deserializing old wire format |
| public static UnfilteredRowIterator onWireCellstoUnfilteredRowIterator(CFMetaData metadata, |
| DecoratedKey key, |
| LegacyDeletionInfo delInfo, |
| Iterator<LegacyCell> cells, |
| boolean reversed, |
| SerializationHelper helper) |
| { |
| |
// If the table is a static compact table, the "column_metadata" are now internally encoded as
// static. This has already been recognized by decodeCellName, but it means the cells
// provided are not in the expected order (the "static" cells are not necessarily at the front).
// So sort them to make sure toUnfilteredRowIterator works as expected.
// Further, if the query is reversed, then the on-wire format still has cells in non-reversed
// order, but we need to have them reversed in the final UnfilteredRowIterator. So reverse them.
| if (metadata.isStaticCompactTable() || reversed) |
| { |
| List<LegacyCell> l = new ArrayList<>(); |
| Iterators.addAll(l, cells); |
| Collections.sort(l, legacyCellComparator(metadata, reversed)); |
| cells = l.iterator(); |
| } |
| |
| return toUnfilteredRowIterator(metadata, key, delInfo, cells, reversed, helper); |
| } |
| |
| private static UnfilteredRowIterator toUnfilteredRowIterator(CFMetaData metadata, |
| DecoratedKey key, |
| LegacyDeletionInfo delInfo, |
| Iterator<LegacyCell> cells, |
| boolean reversed, |
| SerializationHelper helper) |
| { |
// A reducer that basically does nothing: we know the 2 merged iterators can't have conflicting atoms (since we merge cells with range tombstones).
| MergeIterator.Reducer<LegacyAtom, LegacyAtom> reducer = new MergeIterator.Reducer<LegacyAtom, LegacyAtom>() |
| { |
| private LegacyAtom atom; |
| |
| public void reduce(int idx, LegacyAtom current) |
| { |
// We're merging cells with range tombstones, so we should only ever have a single atom to reduce.
| assert atom == null; |
| atom = current; |
| } |
| |
| protected LegacyAtom getReduced() |
| { |
| return atom; |
| } |
| |
| protected void onKeyChange() |
| { |
| atom = null; |
| } |
| }; |
| List<Iterator<LegacyAtom>> iterators = Arrays.asList(asLegacyAtomIterator(cells), asLegacyAtomIterator(delInfo.inRowRangeTombstones())); |
| PeekingIterator<LegacyAtom> atoms = Iterators.peekingIterator(MergeIterator.get(iterators, legacyAtomComparator(metadata), reducer)); |
| |
// Check if we have a static row
| Row staticRow = atoms.hasNext() && atoms.peek().isStatic() |
| ? getNextRow(CellGrouper.staticGrouper(metadata, helper), atoms) |
| : Rows.EMPTY_STATIC_ROW; |
| |
| Iterator<Row> rows = convertToRows(new CellGrouper(metadata, helper), atoms); |
| Iterator<RangeTombstone> ranges = delInfo.deletionInfo.rangeIterator(reversed); |
| return new RowAndDeletionMergeIterator(metadata, |
| key, |
| delInfo.deletionInfo.getPartitionDeletion(), |
| ColumnFilter.all(metadata), |
| staticRow, |
| reversed, |
| EncodingStats.NO_STATS, |
| rows, |
| ranges, |
| true); |
| } |
| |
| public static Row extractStaticColumns(CFMetaData metadata, DataInputPlus in, Columns statics) throws IOException |
| { |
| assert !statics.isEmpty(); |
| assert metadata.isCompactTable(); |
| |
| if (metadata.isSuper()) |
| // TODO: there is in practice nothing to do here, but we need to handle the column_metadata for super columns somewhere else |
| throw new UnsupportedOperationException(); |
| |
| Set<ByteBuffer> columnsToFetch = new HashSet<>(statics.size()); |
| for (ColumnDefinition column : statics) |
| columnsToFetch.add(column.name.bytes); |
| |
| Row.Builder builder = BTreeRow.unsortedBuilder(FBUtilities.nowInSeconds()); |
| builder.newRow(Clustering.STATIC_CLUSTERING); |
| |
| boolean foundOne = false; |
| LegacyAtom atom; |
| while ((atom = readLegacyAtomSkippingUnknownColumn(metadata,in)) != null) |
| { |
| if (atom.isCell()) |
| { |
| LegacyCell cell = atom.asCell(); |
| if (!columnsToFetch.contains(cell.name.encode(metadata))) |
| continue; |
| |
| foundOne = true; |
| cell.name.column.type.validateIfFixedSize(cell.value); |
| builder.addCell(new BufferCell(cell.name.column, cell.timestamp, cell.ttl, cell.localDeletionTime, cell.value, null)); |
| } |
| else |
| { |
| LegacyRangeTombstone tombstone = atom.asRangeTombstone(); |
| // TODO: we need to track tombstones and potentially ignore cells that are |
| // shadowed (or even better, replace them by tombstones). |
| throw new UnsupportedOperationException(); |
| } |
| } |
| |
| return foundOne ? builder.build() : Rows.EMPTY_STATIC_ROW; |
| } |
| |
| private static LegacyAtom readLegacyAtomSkippingUnknownColumn(CFMetaData metadata, DataInputPlus in) |
| throws IOException |
| { |
| while (true) |
| { |
| try |
| { |
| return readLegacyAtom(metadata, in, false); |
| } |
| catch (UnknownColumnException e) |
| { |
| // Simply skip, as the method name implies. |
| } |
| } |
| |
| } |
| |
| private static Row getNextRow(CellGrouper grouper, PeekingIterator<? extends LegacyAtom> cells) |
| { |
| if (!cells.hasNext()) |
| return null; |
| |
| grouper.reset(); |
| while (cells.hasNext() && grouper.addAtom(cells.peek())) |
| { |
| // We've added the cell already in the grouper, so just skip it |
| cells.next(); |
| } |
| return grouper.getRow(); |
| } |
| |
| @SuppressWarnings("unchecked") |
| private static Iterator<LegacyAtom> asLegacyAtomIterator(Iterator<? extends LegacyAtom> iter) |
| { |
| return (Iterator<LegacyAtom>)iter; |
| } |
| |
| private static Iterator<Row> convertToRows(final CellGrouper grouper, final PeekingIterator<LegacyAtom> atoms) |
| { |
| return new AbstractIterator<Row>() |
| { |
| protected Row computeNext() |
| { |
| if (!atoms.hasNext()) |
| return endOfData(); |
| |
| return getNextRow(grouper, atoms); |
| } |
| }; |
| } |
| |
| public static Pair<LegacyRangeTombstoneList, Iterator<LegacyCell>> fromRowIterator(final RowIterator iterator) |
| { |
| return fromRowIterator(iterator.metadata(), iterator, iterator.staticRow()); |
| } |
| |
| private static Pair<LegacyRangeTombstoneList, Iterator<LegacyCell>> fromRowIterator(final CFMetaData metadata, final Iterator<Row> iterator, final Row staticRow) |
| { |
| LegacyRangeTombstoneList deletions = new LegacyRangeTombstoneList(new LegacyBoundComparator(metadata.comparator), 10); |
| Iterator<LegacyCell> cells = new AbstractIterator<LegacyCell>() |
| { |
| private Iterator<LegacyCell> currentRow = initializeRow(); |
| |
| private Iterator<LegacyCell> initializeRow() |
| { |
| if (staticRow == null || staticRow.isEmpty()) |
| return Collections.<LegacyLayout.LegacyCell>emptyIterator(); |
| |
| Pair<LegacyRangeTombstoneList, Iterator<LegacyCell>> row = fromRow(metadata, staticRow); |
| deletions.addAll(row.left); |
| return row.right; |
| } |
| |
| protected LegacyCell computeNext() |
| { |
| while (true) |
| { |
| if (currentRow.hasNext()) |
| return currentRow.next(); |
| |
| if (!iterator.hasNext()) |
| return endOfData(); |
| |
| Pair<LegacyRangeTombstoneList, Iterator<LegacyCell>> row = fromRow(metadata, iterator.next()); |
| deletions.addAll(row.left); |
| currentRow = row.right; |
| } |
| } |
| }; |
| |
| return Pair.create(deletions, cells); |
| } |
| |
| private static Pair<LegacyRangeTombstoneList, Iterator<LegacyCell>> fromRow(final CFMetaData metadata, final Row row) |
| { |
// convert any complex deletions or row deletions into normal range tombstones so that we can build and send a proper RangeTombstoneList
// to legacy nodes
| LegacyRangeTombstoneList deletions = new LegacyRangeTombstoneList(new LegacyBoundComparator(metadata.comparator), 10); |
| |
| if (!row.deletion().isLive()) |
| { |
| Clustering clustering = row.clustering(); |
| Slice.Bound startBound = Slice.Bound.inclusiveStartOf(clustering); |
| Slice.Bound endBound = Slice.Bound.inclusiveEndOf(clustering); |
| |
| LegacyBound start = new LegacyLayout.LegacyBound(startBound, false, null); |
| LegacyBound end = new LegacyLayout.LegacyBound(endBound, false, null); |
| |
| deletions.add(start, end, row.deletion().time().markedForDeleteAt(), row.deletion().time().localDeletionTime()); |
| } |
| |
| for (ColumnData cd : row) |
| { |
| ColumnDefinition col = cd.column(); |
| if (col.isSimple()) |
| continue; |
| |
| DeletionTime delTime = ((ComplexColumnData)cd).complexDeletion(); |
| if (!delTime.isLive()) |
| { |
| Clustering clustering = row.clustering(); |
| boolean isStatic = clustering == Clustering.STATIC_CLUSTERING; |
| assert isStatic == col.isStatic(); |
| |
| Slice.Bound startBound = isStatic |
| ? LegacyDeletionInfo.staticBound(metadata, true) |
| : Slice.Bound.inclusiveStartOf(clustering); |
| Slice.Bound endBound = isStatic |
| ? LegacyDeletionInfo.staticBound(metadata, false) |
| : Slice.Bound.inclusiveEndOf(clustering); |
| |
| LegacyLayout.LegacyBound start = new LegacyLayout.LegacyBound(startBound, isStatic, col); |
| LegacyLayout.LegacyBound end = new LegacyLayout.LegacyBound(endBound, isStatic, col); |
| |
| deletions.add(start, end, delTime.markedForDeleteAt(), delTime.localDeletionTime()); |
| } |
| } |
| |
| Iterator<LegacyCell> cells = new AbstractIterator<LegacyCell>() |
| { |
| private final Iterator<Cell> cells = row.cellsInLegacyOrder(metadata, false).iterator(); |
| // we don't have (and shouldn't have) row markers for compact tables. |
| private boolean hasReturnedRowMarker = metadata.isCompactTable(); |
| |
| protected LegacyCell computeNext() |
| { |
| if (!hasReturnedRowMarker) |
| { |
| hasReturnedRowMarker = true; |
| |
// the primary key liveness info is the 3.0+ equivalent of a row marker; don't include a row marker
// if there's no timestamp on the primary key
| if (!row.primaryKeyLivenessInfo().isEmpty()) |
| { |
| LegacyCellName cellName = new LegacyCellName(row.clustering(), null, null); |
| LivenessInfo info = row.primaryKeyLivenessInfo(); |
| return new LegacyCell(info.isExpiring() ? LegacyCell.Kind.EXPIRING : LegacyCell.Kind.REGULAR, cellName, ByteBufferUtil.EMPTY_BYTE_BUFFER, info.timestamp(), info.localExpirationTime(), info.ttl()); |
| } |
| } |
| |
| if (!cells.hasNext()) |
| return endOfData(); |
| |
| return makeLegacyCell(row.clustering(), cells.next()); |
| } |
| }; |
| return Pair.create(deletions, cells); |
| } |
| |
| private static LegacyCell makeLegacyCell(Clustering clustering, Cell cell) |
| { |
| LegacyCell.Kind kind; |
| if (cell.isCounterCell()) |
| kind = LegacyCell.Kind.COUNTER; |
| else if (cell.isTombstone()) |
| kind = LegacyCell.Kind.DELETED; |
| else if (cell.isExpiring()) |
| kind = LegacyCell.Kind.EXPIRING; |
| else |
| kind = LegacyCell.Kind.REGULAR; |
| |
| CellPath path = cell.path(); |
| assert path == null || path.size() == 1; |
| LegacyCellName name = new LegacyCellName(clustering, cell.column(), path == null ? null : path.get(0)); |
| return new LegacyCell(kind, name, cell.value(), cell.timestamp(), cell.localDeletionTime(), cell.ttl()); |
| } |
| |
| public static RowIterator toRowIterator(final CFMetaData metadata, |
| final DecoratedKey key, |
| final Iterator<LegacyCell> cells, |
| final int nowInSec) |
| { |
| SerializationHelper helper = new SerializationHelper(metadata, 0, SerializationHelper.Flag.LOCAL); |
| return UnfilteredRowIterators.filter(toUnfilteredRowIterator(metadata, key, LegacyDeletionInfo.live(), cells, false, helper), nowInSec); |
| } |
| |
| public static Comparator<LegacyCell> legacyCellComparator(CFMetaData metadata) |
| { |
| return legacyCellComparator(metadata, false); |
| } |
| |
| public static Comparator<LegacyCell> legacyCellComparator(final CFMetaData metadata, final boolean reversed) |
| { |
| final Comparator<LegacyCellName> cellNameComparator = legacyCellNameComparator(metadata, reversed); |
| return new Comparator<LegacyCell>() |
| { |
| public int compare(LegacyCell cell1, LegacyCell cell2) |
| { |
| LegacyCellName c1 = cell1.name; |
| LegacyCellName c2 = cell2.name; |
| |
| int c = cellNameComparator.compare(c1, c2); |
| if (c != 0) |
| return c; |
| |
// The actual sorting when the cellname is equal doesn't matter; we just want to make
// sure the cells are not considered equal.
| if (cell1.timestamp != cell2.timestamp) |
| return cell1.timestamp < cell2.timestamp ? -1 : 1; |
| |
| if (cell1.localDeletionTime != cell2.localDeletionTime) |
| return cell1.localDeletionTime < cell2.localDeletionTime ? -1 : 1; |
| |
| return cell1.value.compareTo(cell2.value); |
| } |
| }; |
| } |
| |
// Note that this doesn't exactly compare cells as they were pre-3.0 because, within a row, it sorts columns like
// 3.0 does, that is, with simple columns before complex columns. In other words, this comparator makes sure cells
// are in the proper order to convert them to actual 3.0 rows.
| public static Comparator<LegacyCellName> legacyCellNameComparator(final CFMetaData metadata, final boolean reversed) |
| { |
| return new Comparator<LegacyCellName>() |
| { |
| public int compare(LegacyCellName c1, LegacyCellName c2) |
| { |
| // Compare clustering first |
| if (c1.clustering == Clustering.STATIC_CLUSTERING) |
| { |
| if (c2.clustering != Clustering.STATIC_CLUSTERING) |
| return -1; |
| } |
| else if (c2.clustering == Clustering.STATIC_CLUSTERING) |
| { |
| return 1; |
| } |
| else |
| { |
| int c = metadata.comparator.compare(c1.clustering, c2.clustering); |
| if (c != 0) |
| return reversed ? -c : c; |
| } |
| |
| // Note that when reversed, we only care about the clustering being reversed, so it's ok |
| // not to take reversed into account below. |
| |
| // Then check the column name |
| if (c1.column != c2.column) |
| { |
| // A null for the column means it's a row marker |
| if (c1.column == null) |
| return -1; |
| if (c2.column == null) |
| return 1; |
| |
| assert c1.column.isRegular() || c1.column.isStatic(); |
| assert c2.column.isRegular() || c2.column.isStatic(); |
| int cmp = c1.column.compareTo(c2.column); |
| if (cmp != 0) |
| return cmp; |
| } |
| |
| assert (c1.collectionElement == null) == (c2.collectionElement == null); |
| |
| if (c1.collectionElement != null) |
| { |
| AbstractType<?> colCmp = ((CollectionType)c1.column.type).nameComparator(); |
| return colCmp.compare(c1.collectionElement, c2.collectionElement); |
| } |
| return 0; |
| } |
| }; |
| } |
| |
| private static boolean equalValues(ClusteringPrefix c1, ClusteringPrefix c2, ClusteringComparator comparator) |
| { |
| assert c1.size() == c2.size(); |
| for (int i = 0; i < c1.size(); i++) |
| { |
| if (comparator.compareComponent(i, c1.get(i), c2.get(i)) != 0) |
| return false; |
| } |
| return true; |
| } |
| |
| static Comparator<LegacyAtom> legacyAtomComparator(CFMetaData metadata) |
| { |
| return (o1, o2) -> |
| { |
// First we want to compare by clustering, but we have to be careful with range tombstones, because
// we can have collection deletions and we want those to sort properly just before the column they
// delete, not before the whole row.
// We also want to special-case statics so they sort before any non-static. Note in particular that
// this special casing is important in the case of one of the atoms being Slice.Bound.BOTTOM: we want
// it to sort after the statics as we deal with statics first in toUnfilteredRowIterator, and having
// Slice.Bound.BOTTOM first would mess that up (note that static deletion is handled through a specific
// static tombstone, see LegacyDeletionInfo.add()).
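// For illustration, the resulting order within a partition is roughly: static atoms first, then rows in
// clustering order; and within a row: the row marker cell, then cells ordered by column name, with a
// collection tombstone sorting immediately before the cells of the collection it deletes.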
| if (o1.isStatic() != o2.isStatic()) |
| return o1.isStatic() ? -1 : 1; |
| |
| ClusteringPrefix c1 = o1.clustering(); |
| ClusteringPrefix c2 = o2.clustering(); |
| |
| int clusteringComparison; |
| if (c1.size() != c2.size() || (o1.isCell() == o2.isCell()) || !equalValues(c1, c2, metadata.comparator)) |
| { |
| clusteringComparison = metadata.comparator.compare(c1, c2); |
| } |
| else |
| { |
| // one is a cell and one is a range tombstone, and both have the same prefix size (that is, the |
| // range tombstone is either a row deletion or a collection deletion). |
| LegacyRangeTombstone rt = o1.isCell() ? o2.asRangeTombstone() : o1.asRangeTombstone(); |
| clusteringComparison = rt.isCollectionTombstone() |
| ? 0 |
| : metadata.comparator.compare(c1, c2); |
| } |
| |
| // Note that if both are range tombstones and have the same clustering, then they are equal. |
| if (clusteringComparison != 0) |
| return clusteringComparison; |
| |
| if (o1.isCell()) |
| { |
| LegacyCell cell1 = o1.asCell(); |
| if (o2.isCell()) |
| { |
| LegacyCell cell2 = o2.asCell(); |
| // Check for row marker cells |
| if (cell1.name.column == null) |
| return cell2.name.column == null ? 0 : -1; |
| return cell2.name.column == null ? 1 : cell1.name.column.compareTo(cell2.name.column); |
| } |
| |
| LegacyRangeTombstone rt2 = o2.asRangeTombstone(); |
| assert rt2.isCollectionTombstone(); // otherwise, we shouldn't have got a clustering equality |
| if (cell1.name.column == null) |
| return -1; |
| int cmp = cell1.name.column.compareTo(rt2.start.collectionName); |
| // If both are for the same column, then the RT should come first |
| return cmp == 0 ? 1 : cmp; |
| } |
| else |
| { |
| assert o2.isCell(); |
| LegacyCell cell2 = o2.asCell(); |
| |
| LegacyRangeTombstone rt1 = o1.asRangeTombstone(); |
| assert rt1.isCollectionTombstone(); // otherwise, we shouldn't have got a clustering equality |
| |
| if (cell2.name.column == null) |
| return 1; |
| |
| int cmp = rt1.start.collectionName.compareTo(cell2.name.column); |
| // If both are for the same column, then the RT should come first |
| return cmp == 0 ? -1 : cmp; |
| } |
| }; |
| } |
| |
| public static LegacyAtom readLegacyAtom(CFMetaData metadata, DataInputPlus in, boolean readAllAsDynamic) |
| throws IOException, UnknownColumnException |
| { |
| ByteBuffer cellname = ByteBufferUtil.readWithShortLength(in); |
| if (!cellname.hasRemaining()) |
| return null; // END_OF_ROW |
| |
| try |
| { |
| int b = in.readUnsignedByte(); |
| return (b & RANGE_TOMBSTONE_MASK) != 0 |
| ? readLegacyRangeTombstoneBody(metadata, in, cellname) |
| : readLegacyCellBody(metadata, in, cellname, b, SerializationHelper.Flag.LOCAL, readAllAsDynamic); |
| } |
| catch (UnknownColumnException e) |
| { |
| // We legitimately can get here in 2 cases: |
| // 1) for system tables, because we've unceremoniously removed columns (without registering them as dropped) |
| // 2) for dropped columns. |
// In any other case, there is a mismatch between the schema and the data, and we complain loudly.
// Note that if we are in a legit case of an unknown column, we want to simply skip that cell,
// but we don't do this here; we re-throw the exception because the calling code sometimes has to know
// about this happening. This does mean code calling this method should handle this case properly.
| if (!metadata.ksName.equals(SystemKeyspace.NAME) && metadata.getDroppedColumnDefinition(e.columnName) == null) |
| throw new IllegalStateException(String.format("Got cell for unknown column %s in sstable of %s.%s: " + |
| "This suggest a problem with the schema which doesn't list " + |
| "this column. Even if that column was dropped, it should have " + |
| "been listed as such", UTF8Type.instance.compose(e.columnName), metadata.ksName, metadata.cfName), e); |
| |
| throw e; |
| } |
| } |
| |
| public static LegacyCell readLegacyCell(CFMetaData metadata, DataInput in, SerializationHelper.Flag flag) throws IOException, UnknownColumnException |
| { |
| ByteBuffer cellname = ByteBufferUtil.readWithShortLength(in); |
| int b = in.readUnsignedByte(); |
| return readLegacyCellBody(metadata, in, cellname, b, flag, false); |
| } |
| |
| public static LegacyCell readLegacyCellBody(CFMetaData metadata, DataInput in, ByteBuffer cellname, int mask, SerializationHelper.Flag flag, boolean readAllAsDynamic) |
| throws IOException, UnknownColumnException |
| { |
| // Note that we want to call decodeCellName only after we've deserialized other parts, since it can throw |
| // and we want to throw only after having deserialized the full cell. |
| if ((mask & COUNTER_MASK) != 0) |
| { |
| in.readLong(); // timestampOfLastDelete: this has been unused for a long time so we ignore it |
| long ts = in.readLong(); |
| ByteBuffer value = ByteBufferUtil.readWithLength(in); |
| if (flag == SerializationHelper.Flag.FROM_REMOTE || (flag == SerializationHelper.Flag.LOCAL && CounterContext.instance().shouldClearLocal(value))) |
| value = CounterContext.instance().clearAllLocal(value); |
| return new LegacyCell(LegacyCell.Kind.COUNTER, decodeCellName(metadata, cellname, readAllAsDynamic), value, ts, Cell.NO_DELETION_TIME, Cell.NO_TTL); |
| } |
| else if ((mask & EXPIRATION_MASK) != 0) |
| { |
| int ttl = in.readInt(); |
| int expiration = in.readInt(); |
| long ts = in.readLong(); |
| ByteBuffer value = ByteBufferUtil.readWithLength(in); |
| return new LegacyCell(LegacyCell.Kind.EXPIRING, decodeCellName(metadata, cellname, readAllAsDynamic), value, ts, expiration, ttl); |
| } |
| else |
| { |
| long ts = in.readLong(); |
| ByteBuffer value = ByteBufferUtil.readWithLength(in); |
| LegacyCellName name = decodeCellName(metadata, cellname, readAllAsDynamic); |
| return (mask & COUNTER_UPDATE_MASK) != 0 |
| ? new LegacyCell(LegacyCell.Kind.COUNTER, name, CounterContext.instance().createUpdate(ByteBufferUtil.toLong(value)), ts, Cell.NO_DELETION_TIME, Cell.NO_TTL) |
| : ((mask & DELETION_MASK) == 0 |
| ? new LegacyCell(LegacyCell.Kind.REGULAR, name, value, ts, Cell.NO_DELETION_TIME, Cell.NO_TTL) |
| : new LegacyCell(LegacyCell.Kind.DELETED, name, ByteBufferUtil.EMPTY_BYTE_BUFFER, ts, ByteBufferUtil.toInt(value), Cell.NO_TTL)); |
| } |
| } |
| |
| public static LegacyRangeTombstone readLegacyRangeTombstoneBody(CFMetaData metadata, DataInputPlus in, ByteBuffer boundname) throws IOException |
| { |
| LegacyBound min = decodeTombstoneBound(metadata, boundname, true); |
| LegacyBound max = decodeTombstoneBound(metadata, ByteBufferUtil.readWithShortLength(in), false); |
| DeletionTime dt = DeletionTime.serializer.deserialize(in); |
| return new LegacyRangeTombstone(min, max, dt); |
| } |
| |
| public static Iterator<LegacyCell> deserializeCells(final CFMetaData metadata, |
| final DataInput in, |
| final SerializationHelper.Flag flag, |
| final int size) |
| { |
| return new AbstractIterator<LegacyCell>() |
| { |
| private int i = 0; |
| |
| protected LegacyCell computeNext() |
| { |
| if (i >= size) |
| return endOfData(); |
| |
| ++i; |
| try |
| { |
| return readLegacyCell(metadata, in, flag); |
| } |
| catch (UnknownColumnException e) |
| { |
// We can get here if we read a cell for a dropped column, and if that is the case,
// then simply ignoring the cell is fine. But also note that we ignore it if it's the
// system keyspace, because for those tables we actually remove columns without registering
// them in the dropped columns
| if (metadata.ksName.equals(SystemKeyspace.NAME) || metadata.getDroppedColumnDefinition(e.columnName) != null) |
| return computeNext(); |
| else |
| throw new IOError(e); |
| } |
| catch (IOException e) |
| { |
| throw new IOError(e); |
| } |
| } |
| }; |
| } |
| |
| public static class CellGrouper |
| { |
| /** |
| * The fake TTL used for expired rows that have been compacted. |
| */ |
| private static final int FAKE_TTL = 1; |
| |
| public final CFMetaData metadata; |
| private final boolean isStatic; |
| private final SerializationHelper helper; |
| private final Row.Builder builder; |
| private Clustering clustering; |
| |
| private LegacyRangeTombstone rowDeletion; |
| private LegacyRangeTombstone collectionDeletion; |
| |
| /** |
| * Used to track if we need to add pk liveness info (row marker) when removing invalid legacy cells. |
| * |
| * In 2.1 these invalid cells existed but were not queryable, in this case specifically because they |
| * represented values for clustering key columns that were written as data cells. |
| * |
| * However, the presence (or not) of such cells on an otherwise empty CQL row (or partition) would decide |
| * if an empty result row were returned for the CQL row (or partition). To maintain this behaviour we |
| * insert a row marker containing the liveness info of these invalid cells iff we have no other data |
| * on the row. |
| * |
| * See also CASSANDRA-15365 |
| */ |
| private boolean hasValidCells = false; |
| private LivenessInfo invalidLivenessInfo = null; |
| |
| public CellGrouper(CFMetaData metadata, SerializationHelper helper) |
| { |
| this(metadata, helper, false); |
| } |
| |
| private CellGrouper(CFMetaData metadata, SerializationHelper helper, boolean isStatic) |
| { |
| this.metadata = metadata; |
| this.isStatic = isStatic; |
| this.helper = helper; |
// We cannot use a sorted builder because we don't have exactly the same ordering in 3.0 and pre-3.0. More precisely, in 3.0 we
// store all simple columns before the complex ones within a row, whereas pre-3.0 everything was simply sorted by column name. Note however
// that the unsorted builder won't have to reconcile cells, so the exact value we pass for nowInSec doesn't matter.
| this.builder = BTreeRow.unsortedBuilder(FBUtilities.nowInSeconds()); |
| } |
| |
| public static CellGrouper staticGrouper(CFMetaData metadata, SerializationHelper helper) |
| { |
| return new CellGrouper(metadata, helper, true); |
| } |
| |
| public void reset() |
| { |
| this.clustering = null; |
| this.rowDeletion = null; |
| this.collectionDeletion = null; |
| this.invalidLivenessInfo = null; |
| this.hasValidCells = false; |
| } |
| |
| /** |
| * Try adding the provided atom to the currently grouped row. |
| * |
* @param atom the new atom to try to add. This <b>must</b> be a "row" atom, that is, either a cell or a legacy
* range tombstone that covers only one row (row deletion) or a subset of it (collection
* deletion). This means that legacy range tombstones covering multiple rows (which should be handled as
* legit range tombstones in the new storage engine) should be handled separately. Atoms should also
* be provided in proper clustering order.
* @return {@code true} if the provided atom has been "consumed" by this grouper (this does _not_ mean the
* atom has been "used" by the grouper, as the grouper will skip some shadowed atoms for instance, just
* that {@link #getRow()} shouldn't be called just yet if there are more atoms in the atom iterator we're
* grouping). {@code false} otherwise, that is, if the row currently built by this grouper is done
* _without_ the provided atom being "consumed" (and so {@link #getRow()} should be called and the
* grouper reset, after which the provided atom should be provided again).
| */ |
| public boolean addAtom(LegacyAtom atom) |
| { |
| assert atom.isRowAtom(metadata) : "Unexpected non in-row legacy range tombstone " + atom; |
| return atom.isCell() |
| ? addCell(atom.asCell()) |
| : addRangeTombstone(atom.asRangeTombstone()); |
| } |
| |
| private boolean addCell(LegacyCell cell) |
| { |
| if (clustering == null) |
| { |
| clustering = cell.name.clustering; |
| assert !isStatic || clustering == Clustering.STATIC_CLUSTERING; |
| builder.newRow(clustering); |
| } |
| else if (!clustering.equals(cell.name.clustering)) |
| { |
| return false; |
| } |
| |
| // Ignore shadowed cells |
| if (rowDeletion != null && rowDeletion.deletionTime.deletes(cell.timestamp)) |
| return true; |
| |
| ColumnDefinition column = cell.name.column; |
| if (column == null) |
| { |
| // It's the row marker |
| assert !cell.value.hasRemaining(); |
| // In 2.1, an expired row marker cell might have been converted into a deleted one by compaction. |
| // If we do not set the primary key liveness info for this row and it does not contain any regular columns, |
| // the row will be empty. To avoid that, we reuse the localDeletionTime but use a fake TTL. |
| // The only time in 2.x that we actually delete a row marker is in 2i tables, so in that case we do |
| // want to actually propagate the row deletion. (CASSANDRA-13320) |
| if (!cell.isTombstone()) |
| builder.addPrimaryKeyLivenessInfo(LivenessInfo.create(cell.timestamp, cell.ttl, cell.localDeletionTime)); |
| else if (metadata.isIndex()) |
| builder.addRowDeletion(Row.Deletion.regular(new DeletionTime(cell.timestamp, cell.localDeletionTime))); |
| else |
| builder.addPrimaryKeyLivenessInfo(LivenessInfo.create(cell.timestamp, FAKE_TTL, cell.localDeletionTime)); |
| hasValidCells = true; |
| } |
| else if (column.isPrimaryKeyColumn() && metadata.isCQLTable()) |
| { |
| // SSTables generated offline and side-loaded may include invalid cells which have the column name |
| // of a primary key column. So that we don't fail when encountering these cells, we treat them the |
| // same way as 2.1 did, namely we include their clusterings in the new CQL row, but drop the invalid |
| // column part of the cell. |
| noSpamLogger.warn("Illegal cell name for CQL3 table {}.{}. {} is defined as a primary key column", |
| metadata.ksName, metadata.cfName, column.name); |
| |
| if (invalidLivenessInfo != null) |
| { |
| // when we have several invalid cells we follow the logic in LivenessInfo#supersedes when picking the PKLI to keep: |
| LivenessInfo newInvalidLiveness = LivenessInfo.create(cell.timestamp, cell.isTombstone() ? FAKE_TTL : cell.ttl, cell.localDeletionTime); |
| if (newInvalidLiveness.supersedes(invalidLivenessInfo)) |
| invalidLivenessInfo = newInvalidLiveness; |
| } |
| else |
| { |
| invalidLivenessInfo = LivenessInfo.create(cell.timestamp, cell.isTombstone() ? FAKE_TTL : cell.ttl, cell.localDeletionTime); |
| } |
| return true; |
| } |
| else |
| { |
| if (collectionDeletion != null && collectionDeletion.start.collectionName.name.equals(column.name) && collectionDeletion.deletionTime.deletes(cell.timestamp)) |
| return true; |
| |
| if (helper.includes(column)) |
| { |
| hasValidCells = true; |
| CellPath path = null; |
| if (column.isComplex()) |
| { |
| // Recalling startOfComplexColumn for every cell is a bit inefficient, but it's ok in practice |
| // and it's simpler. And since 1) this only matters for super column selection in thrift in |
| // practice and 2) it is only used during upgrade, it's probably worth keeping things simple. |
| helper.startOfComplexColumn(column); |
| path = cell.name.collectionElement == null ? null : CellPath.create(cell.name.collectionElement); |
| if (!helper.includes(path)) |
| return true; |
| } |
| column.type.validateIfFixedSize(cell.value); |
| Cell c = new BufferCell(column, cell.timestamp, cell.ttl, cell.localDeletionTime, cell.value, path); |
| if (!helper.isDropped(c, column.isComplex())) |
| builder.addCell(c); |
| if (column.isComplex()) |
| { |
| helper.endOfComplexColumn(); |
| } |
| } |
| } |
| return true; |
| } |
| |
| private boolean addRangeTombstone(LegacyRangeTombstone tombstone) |
| { |
| if (tombstone.isRowDeletion(metadata)) |
| { |
| return addRowTombstone(tombstone); |
| } |
| else |
| { |
| // The isRowAtom() assertion back in addAtom would have already triggered otherwise, but spelling it |
| // out nonetheless. |
| assert tombstone.isCollectionTombstone(); |
| return addCollectionTombstone(tombstone); |
| } |
| } |
| |
| private boolean addRowTombstone(LegacyRangeTombstone tombstone) |
| { |
| if (clustering != null) |
| { |
| // If we're already in the row, there might be a chance that there were two range tombstones |
| // written, as 2.x storage format does not guarantee just one range tombstone, unlike 3.x. |
| // We have to make sure that clustering matches, which would mean that tombstone is for the |
| // same row. |
| if (clustering.equals(tombstone.start.getAsClustering(metadata))) |
| { |
| // If the tombstone supersedes the previous deletion, we discard the previous one. |
| // This assumes that we are building the row from a sane source (ie, this row deletion |
| // does not delete anything already added to the builder). See CASSANDRA-15789 for details |
| if (rowDeletion == null || tombstone.deletionTime.supersedes(rowDeletion.deletionTime)) |
| { |
| builder.addRowDeletion(Row.Deletion.regular(tombstone.deletionTime)); |
| rowDeletion = tombstone; |
| hasValidCells = true; |
| } |
| return true; |
| } |
| |
| // different clustering -> new row |
| return false; |
| } |
| |
| clustering = tombstone.start.getAsClustering(metadata); |
| builder.newRow(clustering); |
| builder.addRowDeletion(Row.Deletion.regular(tombstone.deletionTime)); |
| rowDeletion = tombstone; |
| hasValidCells = true; |
| |
| return true; |
| } |
| |
| private boolean addCollectionTombstone(LegacyRangeTombstone tombstone) |
| { |
| // If the collection tombstone is not included in the query (which technically would only apply to thrift |
| // queries, since CQL ones "fetch" everything), we can skip it (so return), but we're probably still within |
| // the current row so we return `true`. Technically, it is possible that the tombstone belongs to another row |
| // than the one currently grouped, but as we ignore it, returning `true` is ok in that case too. |
| if (!helper.includes(tombstone.start.collectionName)) |
| return true; // see CASSANDRA-13109 |
| |
| // The helper needs to be informed about the current complex column identifier before |
| // it can perform the comparison between the recorded drop time and the RT deletion time. |
| // If the RT has been superseded by a drop, we still return true as we don't want the |
| // grouper to terminate yet. |
| helper.startOfComplexColumn(tombstone.start.collectionName); |
| if (helper.isDroppedComplexDeletion(tombstone.deletionTime)) |
| return true; |
| |
| if (clustering == null) |
| { |
| clustering = tombstone.start.getAsClustering(metadata); |
| builder.newRow(clustering); |
| } |
| else if (!clustering.equals(tombstone.start.getAsClustering(metadata))) |
| { |
| return false; |
| } |
| |
| builder.addComplexDeletion(tombstone.start.collectionName, tombstone.deletionTime); |
| if (rowDeletion == null || tombstone.deletionTime.supersedes(rowDeletion.deletionTime)) |
| collectionDeletion = tombstone; |
| hasValidCells = true; |
| |
| return true; |
| } |
| |
| /** |
| * Whether the provided range tombstone starts strictly after the current row of the cell grouper (if no row is |
| * currently started, this returns false). |
| */ |
| public boolean startsAfterCurrentRow(LegacyRangeTombstone rangeTombstone) |
| { |
| return clustering != null && metadata.comparator.compare(rangeTombstone.start.bound, clustering) > 0; |
| } |
| |
| /** |
| * The clustering of the current row of the cell grouper, or {@code null} if no row is currently started. |
| */ |
| public Clustering currentRowClustering() |
| { |
| return clustering; |
| } |
| |
| /** |
| * Generates the row currently grouped by this grouper and resets it for the following row. |
| * <p> |
| * Note that the only correct way to call this is when either all the atoms we're trying to group have been |
| * consumed, or when {@link #addAtom(LegacyAtom)} returns {@code false}. |
| * |
| * @return the current row that has been grouped, or {@code null} in the rare case where all the atoms |
| * "consumed" by {@link #addAtom(LegacyAtom)} for this row were skipped (we skip atoms under a few conditions). |
| */ |
| public Row getRow() |
| { |
| if (clustering == null) |
| return null; |
| if (!hasValidCells && invalidLivenessInfo != null) |
| builder.addPrimaryKeyLivenessInfo(invalidLivenessInfo); |
| return builder.build(); |
| } |
| } |
| |
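| /** |
| * Simple holder for a partition in its legacy (pre-3.0) form: the partition-level deletion, the range |
| * tombstones and the cells. |
| */ |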
| public static class LegacyUnfilteredPartition |
| { |
| public final DeletionTime partitionDeletion; |
| public final LegacyRangeTombstoneList rangeTombstones; |
| public final List<LegacyCell> cells; |
| |
| private LegacyUnfilteredPartition(DeletionTime partitionDeletion, LegacyRangeTombstoneList rangeTombstones, List<LegacyCell> cells) |
| { |
| this.partitionDeletion = partitionDeletion; |
| this.rangeTombstones = rangeTombstones; |
| this.cells = cells; |
| } |
| |
| public void digest(CFMetaData metadata, MessageDigest digest) |
| { |
| for (LegacyCell cell : cells) |
| { |
| digest.update(cell.name.encode(metadata).duplicate()); |
| |
| if (cell.isCounter()) |
| CounterContext.instance().updateDigest(digest, cell.value); |
| else |
| digest.update(cell.value.duplicate()); |
| |
| FBUtilities.updateWithLong(digest, cell.timestamp); |
| FBUtilities.updateWithByte(digest, cell.serializationFlags()); |
| |
| if (cell.isExpiring()) |
| FBUtilities.updateWithInt(digest, cell.ttl); |
| |
| if (cell.isCounter()) |
| { |
| // Counters used to have the timestampOfLastDelete field, which we stopped using long ago and has been hard-coded |
| // to Long.MIN_VALUE but was still taken into account in 2.2 counter digests (to maintain backward compatibility |
| // in the first place). |
| FBUtilities.updateWithLong(digest, Long.MIN_VALUE); |
| } |
| } |
| |
| if (partitionDeletion.markedForDeleteAt() != Long.MIN_VALUE) |
| digest.update(ByteBufferUtil.bytes(partitionDeletion.markedForDeleteAt())); |
| |
| if (!rangeTombstones.isEmpty()) |
| rangeTombstones.updateDigest(digest); |
| } |
| } |
| |
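| /** |
| * A decoded legacy (pre-3.0) cell name: the clustering it maps to, the column it belongs to ({@code null} |
| * for a row marker) and, for collection cells, the collection element. |
| */ |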
| public static class LegacyCellName |
| { |
| public final Clustering clustering; |
| public final ColumnDefinition column; |
| public final ByteBuffer collectionElement; |
| |
| @VisibleForTesting |
| public LegacyCellName(Clustering clustering, ColumnDefinition column, ByteBuffer collectionElement) |
| { |
| this.clustering = clustering; |
| this.column = column; |
| this.collectionElement = collectionElement; |
| } |
| |
| public static LegacyCellName create(Clustering clustering, ColumnDefinition column) |
| { |
| return new LegacyCellName(clustering, column, null); |
| } |
| |
| public ByteBuffer encode(CFMetaData metadata) |
| { |
| return encodeCellName(metadata, clustering, column == null ? ByteBufferUtil.EMPTY_BYTE_BUFFER : column.name.bytes, collectionElement); |
| } |
| |
| public ByteBuffer superColumnSubName() |
| { |
| assert collectionElement != null; |
| return collectionElement; |
| } |
| |
| public ByteBuffer superColumnName() |
| { |
| return clustering.get(0); |
| } |
| |
| @Override |
| public String toString() |
| { |
| StringBuilder sb = new StringBuilder(); |
| for (int i = 0; i < clustering.size(); i++) |
| sb.append(i > 0 ? ":" : "").append(clustering.get(i) == null ? "null" : ByteBufferUtil.bytesToHex(clustering.get(i))); |
| return String.format("Cellname(clustering=%s, column=%s, collElt=%s)", sb.toString(), column == null ? "null" : column.name, collectionElement == null ? "null" : ByteBufferUtil.bytesToHex(collectionElement)); |
| } |
| } |
| |
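| /** |
| * A legacy (pre-3.0) range tombstone bound: the corresponding slice bound, whether the bound is static, and, |
| * for collection tombstones, the collection column the bound refers to. |
| */ |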
| public static class LegacyBound |
| { |
| public static final LegacyBound BOTTOM = new LegacyBound(Slice.Bound.BOTTOM, false, null); |
| public static final LegacyBound TOP = new LegacyBound(Slice.Bound.TOP, false, null); |
| |
| public final Slice.Bound bound; |
| public final boolean isStatic; |
| public final ColumnDefinition collectionName; |
| |
| public LegacyBound(Slice.Bound bound, boolean isStatic, ColumnDefinition collectionName) |
| { |
| this.bound = bound; |
| this.isStatic = isStatic; |
| this.collectionName = collectionName; |
| } |
| |
| public Clustering getAsClustering(CFMetaData metadata) |
| { |
| if (isStatic) |
| return Clustering.STATIC_CLUSTERING; |
| |
| assert bound.size() == metadata.comparator.size(); |
| ByteBuffer[] values = new ByteBuffer[bound.size()]; |
| for (int i = 0; i < bound.size(); i++) |
| values[i] = bound.get(i); |
| return new Clustering(values); |
| } |
| |
| @Override |
| public String toString() |
| { |
| StringBuilder sb = new StringBuilder(); |
| sb.append(bound.kind()).append('('); |
| for (int i = 0; i < bound.size(); i++) |
| sb.append(i > 0 ? ":" : "").append(bound.get(i) == null ? "null" : ByteBufferUtil.bytesToHex(bound.get(i))); |
| sb.append(')'); |
| return String.format("Bound(%s, collection=%s)", sb.toString(), collectionName == null ? "null" : collectionName.name); |
| } |
| } |
| |
| public interface LegacyAtom |
| { |
| public boolean isCell(); |
| |
| // note that for static atoms, LegacyCell and LegacyRangeTombstone behave differently here: |
| // - LegacyCell returns the modern Clustering.STATIC_CLUSTERING |
| // - LegacyRangeTombstone returns the 2.2 bound (i.e. N empty ByteBuffers, where N is the number of clustering columns) |
| // in LegacyDeletionInfo.add(), we split any LRT with a static bound out into the inRowTombstones collection; |
| // these are merged with regular row cells in the CellGrouper, and their clustering is obtained via start.getAsClustering() |
| // (also, it should be impossible to issue raw static row deletions anyway) |
| public ClusteringPrefix clustering(); |
| public boolean isStatic(); |
| |
| public LegacyCell asCell(); |
| public LegacyRangeTombstone asRangeTombstone(); |
| |
| /** |
| * Whether the atom is one that becomes part of a {@link Row} in the new storage engine, meaning it is either |
| * a cell or a legacy range tombstone that covers a single row, or parts of one. |
| */ |
| public boolean isRowAtom(CFMetaData metadata); |
| } |
| |
| /** |
| * A legacy cell. |
| * <p> |
| * This is used as a temporary object to facilitate dealing with the legacy format; it |
| * is not meant to be optimal. |
| */ |
| public static class LegacyCell implements LegacyAtom |
| { |
| private final static int DELETION_MASK = 0x01; |
| private final static int EXPIRATION_MASK = 0x02; |
| private final static int COUNTER_MASK = 0x04; |
| private final static int COUNTER_UPDATE_MASK = 0x08; |
| private final static int RANGE_TOMBSTONE_MASK = 0x10; |
| |
| public enum Kind { REGULAR, EXPIRING, DELETED, COUNTER } |
| |
| public final Kind kind; |
| |
| public final LegacyCellName name; |
| public final ByteBuffer value; |
| |
| public final long timestamp; |
| public final int localDeletionTime; |
| public final int ttl; |
| |
| @VisibleForTesting |
| public LegacyCell(Kind kind, LegacyCellName name, ByteBuffer value, long timestamp, int localDeletionTime, int ttl) |
| { |
| this.kind = kind; |
| this.name = name; |
| this.value = value; |
| this.timestamp = timestamp; |
| this.localDeletionTime = localDeletionTime; |
| this.ttl = ttl; |
| } |
| |
| public static LegacyCell regular(CFMetaData metadata, ByteBuffer superColumnName, ByteBuffer name, ByteBuffer value, long timestamp) |
| throws UnknownColumnException |
| { |
| return new LegacyCell(Kind.REGULAR, decodeCellName(metadata, superColumnName, name), value, timestamp, Cell.NO_DELETION_TIME, Cell.NO_TTL); |
| } |
| |
| public static LegacyCell expiring(CFMetaData metadata, ByteBuffer superColumnName, ByteBuffer name, ByteBuffer value, long timestamp, int ttl, int nowInSec) |
| throws UnknownColumnException |
| { |
| /* |
| * CASSANDRA-14092: Max expiration date capping may be performed here; expiration overflow policy application |
| * is done at {@link org.apache.cassandra.thrift.ThriftValidation#validateTtl(CFMetaData, Column)} |
| */ |
| return new LegacyCell(Kind.EXPIRING, decodeCellName(metadata, superColumnName, name), value, timestamp, ExpirationDateOverflowHandling.computeLocalExpirationTime(nowInSec, ttl), ttl); |
| } |
| |
| public static LegacyCell tombstone(CFMetaData metadata, ByteBuffer superColumnName, ByteBuffer name, long timestamp, int nowInSec) |
| throws UnknownColumnException |
| { |
| return new LegacyCell(Kind.DELETED, decodeCellName(metadata, superColumnName, name), ByteBufferUtil.EMPTY_BYTE_BUFFER, timestamp, nowInSec, LivenessInfo.NO_TTL); |
| } |
| |
| public static LegacyCell counterUpdate(CFMetaData metadata, ByteBuffer superColumnName, ByteBuffer name, long value) |
| throws UnknownColumnException |
| { |
| // See UpdateParameters.addCounter() for more details on this |
| ByteBuffer counterValue = CounterContext.instance().createUpdate(value); |
| return counter(decodeCellName(metadata, superColumnName, name), counterValue); |
| } |
| |
| public static LegacyCell counter(LegacyCellName name, ByteBuffer value) |
| { |
| return new LegacyCell(Kind.COUNTER, name, value, FBUtilities.timestampMicros(), Cell.NO_DELETION_TIME, Cell.NO_TTL); |
| } |
| |
| public byte serializationFlags() |
| { |
| if (isExpiring()) |
| return EXPIRATION_MASK; |
| if (isTombstone()) |
| return DELETION_MASK; |
| if (isCounterUpdate()) |
| return COUNTER_UPDATE_MASK; |
| if (isCounter()) |
| return COUNTER_MASK; |
| return 0; |
| } |
| |
| public boolean isCounterUpdate() |
| { |
| // See UpdateParameters.addCounter() for more details on this |
| return isCounter() && CounterContext.instance().isUpdate(value); |
| } |
| |
| public ClusteringPrefix clustering() |
| { |
| return name.clustering; |
| } |
| |
| public boolean isStatic() |
| { |
| return name.clustering == Clustering.STATIC_CLUSTERING; |
| } |
| |
| public boolean isCell() |
| { |
| return true; |
| } |
| |
| public LegacyCell asCell() |
| { |
| return this; |
| } |
| |
| public LegacyRangeTombstone asRangeTombstone() |
| { |
| throw new UnsupportedOperationException(); |
| } |
| |
| @Override |
| public boolean isRowAtom(CFMetaData metaData) |
| { |
| return true; |
| } |
| |
| public boolean isCounter() |
| { |
| return kind == Kind.COUNTER; |
| } |
| |
| public boolean isExpiring() |
| { |
| return kind == Kind.EXPIRING; |
| } |
| |
| public boolean isTombstone() |
| { |
| return kind == Kind.DELETED; |
| } |
| |
| public boolean isLive(int nowInSec) |
| { |
| if (isTombstone()) |
| return false; |
| |
| return !isExpiring() || nowInSec < localDeletionTime; |
| } |
| |
| @Override |
| public String toString() |
| { |
| return String.format("LegacyCell(%s, name=%s, v=%s, ts=%s, ldt=%s, ttl=%s)", kind, name, ByteBufferUtil.bytesToHex(value), timestamp, localDeletionTime, ttl); |
| } |
| } |
| |
| /** |
| * A legacy range tombstone. |
| * <p> |
| * This is used as a temporary object to facilitate dealing with the legacy format; it |
| * is not meant to be optimal. |
| */ |
| public static class LegacyRangeTombstone implements LegacyAtom |
| { |
| public final LegacyBound start; |
| public final LegacyBound stop; |
| public final DeletionTime deletionTime; |
| |
| public LegacyRangeTombstone(LegacyBound start, LegacyBound stop, DeletionTime deletionTime) |
| { |
| // Because of the way RangeTombstoneList works, we can have a tombstone where only one of |
| // the bound has a collectionName. That happens if we have a big tombstone A (spanning one |
| // or multiple rows) and a collection tombstone B. In that case, RangeTombstoneList will |
| // split this into 3 RTs: the first one from the beginning of A to the beginning of B, |
| // then B, then a third one from the end of B to the end of A. To make this simpler, if |
| // we detect that case we transform the 1st and 3rd tombstone so they don't end in the middle |
| // of a row (which is still correct). |
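| // For example (purely illustrative): with a single clustering column ck and a collection column m, a fragment |
| // with start = (ck=1, collectionName=null) and stop = (ck=1, collectionName=m) gets its stop widened to |
| // inclusiveEndOf(ck=1) with no collection name, so that it covers up to the end of that row. |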
| if ((start.collectionName == null) != (stop.collectionName == null)) |
| { |
| if (start.collectionName == null) |
| stop = new LegacyBound(Slice.Bound.inclusiveEndOf(stop.bound.values), stop.isStatic, null); |
| else |
| start = new LegacyBound(Slice.Bound.inclusiveStartOf(start.bound.values), start.isStatic, null); |
| } |
| else if (!Objects.equals(start.collectionName, stop.collectionName)) |
| { |
| // We're in the similar but slightly more complex case where on top of the big tombstone |
| // A, we have 2 (or more) collection tombstones B and C within A. So we also end up with |
| // a tombstone that goes between the end of B and the start of C. |
| start = new LegacyBound(start.bound, start.isStatic, null); |
| stop = new LegacyBound(stop.bound, stop.isStatic, null); |
| } |
| |
| this.start = start; |
| this.stop = stop; |
| this.deletionTime = deletionTime; |
| } |
| |
| /** @see LegacyAtom#clustering() for an explanation of the static-bound inconsistencies */ |
| public ClusteringPrefix clustering() |
| { |
| return start.bound; |
| } |
| |
| public LegacyRangeTombstone withNewStart(LegacyBound newStart) |
| { |
| return new LegacyRangeTombstone(newStart, stop, deletionTime); |
| } |
| |
| public LegacyRangeTombstone withNewStart(Slice.Bound newStart) |
| { |
| return withNewStart(new LegacyBound(newStart, start.isStatic, null)); |
| } |
| |
| public LegacyRangeTombstone withNewEnd(LegacyBound newStop) |
| { |
| return new LegacyRangeTombstone(start, newStop, deletionTime); |
| } |
| |
| public LegacyRangeTombstone withNewEnd(Slice.Bound newEnd) |
| { |
| return withNewEnd(new LegacyBound(newEnd, stop.isStatic, null)); |
| } |
| |
| public boolean isCell() |
| { |
| return false; |
| } |
| |
| public boolean isStatic() |
| { |
| return start.isStatic || stop.isStatic; |
| } |
| |
| public LegacyCell asCell() |
| { |
| throw new UnsupportedOperationException(); |
| } |
| |
| public LegacyRangeTombstone asRangeTombstone() |
| { |
| return this; |
| } |
| |
| @Override |
| public boolean isRowAtom(CFMetaData metadata) |
| { |
| return isCollectionTombstone() || isRowDeletion(metadata); |
| } |
| |
| public boolean isCollectionTombstone() |
| { |
| return start.collectionName != null; |
| } |
| |
| public boolean isRowDeletion(CFMetaData metadata) |
| { |
| if (start.collectionName != null |
| || stop.collectionName != null |
| || start.bound.size() != metadata.comparator.size() |
| || stop.bound.size() != metadata.comparator.size()) |
| return false; |
| |
| for (int i = 0; i < start.bound.size(); i++) |
| if (!Objects.equals(start.bound.get(i), stop.bound.get(i))) |
| return false; |
| return true; |
| } |
| |
| @Override |
| public String toString() |
| { |
| return String.format("RT(%s-%s, %s)", start, stop, deletionTime); |
| } |
| } |
| |
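| /** |
| * Deletion info in its legacy form: the partition-level and range deletions, plus the range tombstones that |
| * actually represent row or collection deletions and thus must be merged with cells by the {@link CellGrouper}. |
| */ |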
| public static class LegacyDeletionInfo |
| { |
| public final MutableDeletionInfo deletionInfo; |
| public final List<LegacyRangeTombstone> inRowTombstones = new ArrayList<>(); |
| |
| private LegacyDeletionInfo(MutableDeletionInfo deletionInfo) |
| { |
| this.deletionInfo = deletionInfo; |
| } |
| |
| public static LegacyDeletionInfo live() |
| { |
| return new LegacyDeletionInfo(MutableDeletionInfo.live()); |
| } |
| |
| public void add(DeletionTime topLevel) |
| { |
| deletionInfo.add(topLevel); |
| } |
| |
| private static Slice.Bound staticBound(CFMetaData metadata, boolean isStart) |
| { |
| // In pre-3.0 nodes, a static row started with a clustering whose values are all empty, so we |
| // preserve that here. Note that in practice it doesn't really matter, since the rest |
| // of the code will ignore the bound for RTs that have their static flag set. |
| ByteBuffer[] values = new ByteBuffer[metadata.comparator.size()]; |
| for (int i = 0; i < values.length; i++) |
| values[i] = ByteBufferUtil.EMPTY_BYTE_BUFFER; |
| return isStart |
| ? Slice.Bound.inclusiveStartOf(values) |
| : Slice.Bound.inclusiveEndOf(values); |
| } |
| |
| public void add(CFMetaData metadata, LegacyRangeTombstone tombstone) |
| { |
| if (metadata.hasStaticColumns()) |
| { |
| /* |
| * For tables having static columns we have to deal with the following cases: |
| * 1. the end of the tombstone is static (in which case either the start is static or is BOTTOM, which is the same |
| * for our consideration). This means that either the range only deletes the static row, or that it's a collection |
| * tombstone of a static collection. In both cases, we just add the tombstone to the inRowTombstones. |
| * 2. only the start is static. There are then 2 subcases: either the start is inclusive, which means we include the |
| * static row and more (so we add an inRowTombstone for the static row and deal with the rest normally). Or the start |
| * is exclusive, which means we explicitly exclude the static row (in which case we can just add the tombstone |
| * as if it started at BOTTOM). |
| * 3. neither of the bounds is static but the start is BOTTOM. This means we intended to delete the static row so we |
| * need to add it to the inRowTombstones (and otherwise handle the range normally). |
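| * |
| * For example (purely illustrative): a 2.x tombstone covering "the static row and all rows up to some clustering c", |
| * arriving with an inclusive static start bound, falls under case 2: we add an in-row tombstone covering just the |
| * static row, then rewrite the start to BOTTOM so that the remainder is handled as a regular range tombstone below. |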
| */ |
| if (tombstone.stop.isStatic) |
| { |
| // If the start is BOTTOM, we replace it by the beginning of the static row so as not to confuse the |
| // LegacyRangeTombstone.isRowDeletion() method |
| if (tombstone.start == LegacyBound.BOTTOM) |
| tombstone = tombstone.withNewStart(new LegacyBound(staticBound(metadata, true), true, null)); |
| inRowTombstones.add(tombstone); |
| return; |
| } |
| |
| if (tombstone.start.isStatic) |
| { |
| if (tombstone.start.bound.isInclusive()) |
| inRowTombstones.add(tombstone.withNewEnd(new LegacyBound(staticBound(metadata, false), true, null))); |
| |
| tombstone = tombstone.withNewStart(LegacyBound.BOTTOM); |
| } |
| else if (tombstone.start == LegacyBound.BOTTOM) |
| { |
| inRowTombstones.add(new LegacyRangeTombstone(new LegacyBound(staticBound(metadata, true), true, null), |
| new LegacyBound(staticBound(metadata, false), true, null), |
| tombstone.deletionTime)); |
| } |
| } |
| |
| if (tombstone.isCollectionTombstone() || tombstone.isRowDeletion(metadata)) |
| inRowTombstones.add(tombstone); |
| else |
| add(metadata, new RangeTombstone(Slice.make(tombstone.start.bound, tombstone.stop.bound), tombstone.deletionTime)); |
| } |
| |
| public void add(CFMetaData metadata, RangeTombstone tombstone) |
| { |
| deletionInfo.add(tombstone, metadata.comparator); |
| } |
| |
| public Iterator<LegacyRangeTombstone> inRowRangeTombstones() |
| { |
| return inRowTombstones.iterator(); |
| } |
| |
| public static LegacyDeletionInfo deserialize(CFMetaData metadata, DataInputPlus in) throws IOException |
| { |
| DeletionTime topLevel = DeletionTime.serializer.deserialize(in); |
| |
| int rangeCount = in.readInt(); |
| if (rangeCount == 0) |
| return new LegacyDeletionInfo(new MutableDeletionInfo(topLevel)); |
| |
| LegacyDeletionInfo delInfo = new LegacyDeletionInfo(new MutableDeletionInfo(topLevel)); |
| for (int i = 0; i < rangeCount; i++) |
| { |
| LegacyBound start = decodeTombstoneBound(metadata, ByteBufferUtil.readWithShortLength(in), true); |
| LegacyBound end = decodeTombstoneBound(metadata, ByteBufferUtil.readWithShortLength(in), false); |
| int delTime = in.readInt(); |
| long markedAt = in.readLong(); |
| |
| delInfo.add(metadata, new LegacyRangeTombstone(start, end, new DeletionTime(markedAt, delTime))); |
| } |
| return delInfo; |
| } |
| } |
| |
| /** |
| * A helper class for LegacyRangeTombstoneList. This replaces the Comparator<Composite> that RTL used before 3.0. |
| */ |
| private static class LegacyBoundComparator implements Comparator<LegacyBound> |
| { |
| ClusteringComparator clusteringComparator; |
| |
| public LegacyBoundComparator(ClusteringComparator clusteringComparator) |
| { |
| this.clusteringComparator = clusteringComparator; |
| } |
| |
| public int compare(LegacyBound a, LegacyBound b) |
| { |
| // In the legacy sorting, BOTTOM comes before anything else |
| if (a == LegacyBound.BOTTOM) |
| return b == LegacyBound.BOTTOM ? 0 : -1; |
| if (b == LegacyBound.BOTTOM) |
| return 1; |
| |
| // Excluding BOTTOM, statics are always before anything else. |
| if (a.isStatic != b.isStatic) |
| return a.isStatic ? -1 : 1; |
| |
| // We have to be careful with bound comparison because of collections. Namely, if the 2 bounds represent the |
| // same prefix, then we should take the collectionName into account before taking the bounds kind |
| // (ClusteringPrefix.Kind). This means we can't really call ClusteringComparator.compare() directly. |
| // For instance, if |
| // a is (bound=INCL_START_BOUND('x'), collectionName='d') |
| // b is (bound=INCL_END_BOUND('x'), collectionName='c') |
| // Then b < a since the element 'c' of collection 'x' comes before element 'd', but calling |
| // clusteringComparator.compare(a.bound, b.bound) returns -1. |
| // See CASSANDRA-13125 for details. |
| int sa = a.bound.size(); |
| int sb = b.bound.size(); |
| for (int i = 0; i < Math.min(sa, sb); i++) |
| { |
| int cmp = clusteringComparator.compareComponent(i, a.bound.get(i), b.bound.get(i)); |
| if (cmp != 0) |
| return cmp; |
| } |
| |
| if (sa != sb) |
| return sa < sb ? a.bound.kind().comparedToClustering : -b.bound.kind().comparedToClustering; |
| |
| // Both bound represent the same prefix, compare the collection names |
| // If one has a collection name and the other doesn't, the one without it sorts first, as it points to the beginning of the row. |
| if ((a.collectionName == null) != (b.collectionName == null)) |
| return a.collectionName == null ? -1 : 1; |
| |
| // If they both have a collection, compare that first |
| if (a.collectionName != null) |
| { |
| int cmp = UTF8Type.instance.compare(a.collectionName.name.bytes, b.collectionName.name.bytes); |
| if (cmp != 0) |
| return cmp; |
| } |
| |
| // Lastly, if everything so far is equal, compare their clustering kind |
| return ClusteringPrefix.Kind.compare(a.bound.kind(), b.bound.kind()); |
| } |
| } |
| |
| /** |
| * Almost an entire copy of RangeTombstoneList from C* 2.1. The main difference is that LegacyBoundComparator |
| * is used in place of Comparator<Composite> (because Composite doesn't exist any more). |
| * |
| * This class is needed to allow us to convert single-row deletions and complex deletions into range tombstones |
| * and properly merge them into the normal set of range tombstones. |
| */ |
| public static class LegacyRangeTombstoneList |
| { |
| private final LegacyBoundComparator comparator; |
| |
| // Note: we don't want to use a List for the markedAts and delTimes to avoid boxing. We could |
| // use a List for starts and ends, but having arrays everywhere is almost simpler. |
| LegacyBound[] starts; |
| LegacyBound[] ends; |
| private long[] markedAts; |
| private int[] delTimes; |
| |
| private int size; |
| |
| private LegacyRangeTombstoneList(LegacyBoundComparator comparator, LegacyBound[] starts, LegacyBound[] ends, long[] markedAts, int[] delTimes, int size) |
| { |
| assert starts.length == ends.length && starts.length == markedAts.length && starts.length == delTimes.length; |
| this.comparator = comparator; |
| this.starts = starts; |
| this.ends = ends; |
| this.markedAts = markedAts; |
| this.delTimes = delTimes; |
| this.size = size; |
| } |
| |
| public LegacyRangeTombstoneList(LegacyBoundComparator comparator, int capacity) |
| { |
| this(comparator, new LegacyBound[capacity], new LegacyBound[capacity], new long[capacity], new int[capacity], 0); |
| } |
| |
| @Override |
| public String toString() |
| { |
| StringBuilder sb = new StringBuilder(); |
| sb.append('['); |
| for (int i = 0; i < size; i++) |
| { |
| if (i > 0) |
| sb.append(','); |
| sb.append('(').append(starts[i]).append(", ").append(ends[i]).append(')'); |
| } |
| return sb.append(']').toString(); |
| } |
| |
| public boolean isEmpty() |
| { |
| return size == 0; |
| } |
| |
| public int size() |
| { |
| return size; |
| } |
| |
| /** |
| * Adds a new range tombstone. |
| * |
| * This method will be faster if the new tombstone sorts after all the currently existing ones (this is a common use case), |
| * but it doesn't assume it. |
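| * <p> |
| * Overlaps are resolved by {@code markedAt}: as a purely illustrative example, assuming three bounds |
| * b1 < b2 < b3 under the comparator, adding [b1, b3]@10 and then [b2, b3]@20 leaves the list as |
| * [b1, b2]@10 followed by [b2, b3]@20, the newer timestamp winning on the overlapped part. |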
| */ |
| public void add(LegacyBound start, LegacyBound end, long markedAt, int delTime) |
| { |
| if (isEmpty()) |
| { |
| addInternal(0, start, end, markedAt, delTime); |
| return; |
| } |
| |
| int c = comparator.compare(ends[size-1], start); |
| |
| // Fast path if we add in sorted order |
| if (c <= 0) |
| { |
| addInternal(size, start, end, markedAt, delTime); |
| } |
| else |
| { |
| // Note: insertFrom expects i to be the insertion point in terms of interval ends |
| int pos = Arrays.binarySearch(ends, 0, size, start, comparator); |
| insertFrom((pos >= 0 ? pos : -pos-1), start, end, markedAt, delTime); |
| } |
| } |
| |
| /* |
| * Inserts a new element starting at index i. This method assumes that: |
| * ends[i-1] <= start <= ends[i] |
| * |
| * A RangeTombstoneList is a list of range [s_0, e_0]...[s_n, e_n] such that: |
| * - s_i <= e_i |
| * - e_i <= s_i+1 |
| * - if s_i == e_i and e_i == s_i+1 then s_i+1 < e_i+1 |
| * Basically, ranges are non-overlapping except for their bounds, and they are in order. And while |
| * we allow ranges with the same value for the start and end, we don't allow repeating |
| * such a range (so we can't have [0, 0][0, 0] even though it would respect the first 2 |
| * conditions). |
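| * For example (illustrative), [0, 2][2, 2][2, 5] satisfies these conditions, while [0, 2][1, 3] does not |
| * (the first range ends after the second one starts). |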
| * |
| */ |
| |
| /** |
| * Adds all the range tombstones of {@code tombstones} to this RangeTombstoneList. |
| */ |
| public void addAll(LegacyRangeTombstoneList tombstones) |
| { |
| if (tombstones.isEmpty()) |
| return; |
| |
| if (isEmpty()) |
| { |
| copyArrays(tombstones, this); |
| return; |
| } |
| |
| /* |
| * We basically have 2 techniques we can use here: either we repeatedly call add() on the elements of tombstones, |
| * or we do a merge of both (sorted) lists. If this list is sufficiently bigger than the one we add, then |
| * calling add() will be faster, otherwise merging will be faster. |
| * |
| * Let's note that during memtable updates, it might not be uncommon that a new update has only a few range |
| * tombstones, while the CF we're adding it to (the one in the memtable) has many. In that case, using add() is |
| * likely going to be faster. |
| * |
| * In other cases however, like when diffing responses from multiple nodes, the tombstone lists we "merge" will |
| * likely be similarly sized, so using add() might be a bit inefficient. |
| * |
| * Roughly speaking (this ignores the fact that updating an element is not exactly constant, but that's not a big |
| * deal), if n is the size of this list and m is the size of tombstones, merging is O(n+m) while using add() is O(m*log(n)). |
| * |
| * But let's not crank up a logarithm computation for that. Long story short, merging will be a bad choice only |
| * if this list is a lot bigger than the other one, so let's keep it simple. |
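| * |
| * As a rough illustration of that trade-off: with n = 10,000 existing tombstones and m = 10 incoming ones, |
| * repeated add() costs about m*log2(n) ~ 10 * 13 ~ 130 comparisons, versus about n+m ~ 10,010 steps for a full |
| * merge, hence the simple 10x size-ratio check below. |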
| */ |
| if (size > 10 * tombstones.size) |
| { |
| for (int i = 0; i < tombstones.size; i++) |
| add(tombstones.starts[i], tombstones.ends[i], tombstones.markedAts[i], tombstones.delTimes[i]); |
| } |
| else |
| { |
| int i = 0; |
| int j = 0; |
| while (i < size && j < tombstones.size) |
| { |
| if (comparator.compare(tombstones.starts[j], ends[i]) <= 0) |
| { |
| insertFrom(i, tombstones.starts[j], tombstones.ends[j], tombstones.markedAts[j], tombstones.delTimes[j]); |
| j++; |
| } |
| else |
| { |
| i++; |
| } |
| } |
| // Adds the remaining ones from tombstones if any (note that addInternal will increment size if relevant). |
| for (; j < tombstones.size; j++) |
| addInternal(size, tombstones.starts[j], tombstones.ends[j], tombstones.markedAts[j], tombstones.delTimes[j]); |
| } |
| } |
| |
| private static void copyArrays(LegacyRangeTombstoneList src, LegacyRangeTombstoneList dst) |
| { |
| dst.grow(src.size); |
| System.arraycopy(src.starts, 0, dst.starts, 0, src.size); |
| System.arraycopy(src.ends, 0, dst.ends, 0, src.size); |
| System.arraycopy(src.markedAts, 0, dst.markedAts, 0, src.size); |
| System.arraycopy(src.delTimes, 0, dst.delTimes, 0, src.size); |
| dst.size = src.size; |
| } |
| |
| private void insertFrom(int i, LegacyBound start, LegacyBound end, long markedAt, int delTime) |
| { |
| while (i < size) |
| { |
| assert i == 0 || comparator.compare(ends[i-1], start) <= 0; |
| |
| int c = comparator.compare(start, ends[i]); |
| assert c <= 0; |
| if (c == 0) |
| { |
| // If start == ends[i], then we can insert from the next one (basically the new element |
| // really starts at the next element), except for the case where starts[i] == ends[i]. |
| // In this latter case, if we were to move to the next element, we could end up with ...[x, x][x, x]... |
| if (comparator.compare(starts[i], ends[i]) == 0) |
| { |
| // The current element covers a single value which is equal to the start of the inserted |
| // element. If the inserted element overwrites the current one, just remove the current |
| // (it's included in what we insert) and proceed with the insert. |
| if (markedAt > markedAts[i]) |
| { |
| removeInternal(i); |
| continue; |
| } |
| |
| // Otherwise (the current singleton interval overrides the new one), we want to leave the |
| // current element and move to the next, unless start == end since that means the new element |
| // is in fact fully covered by the current one (so we're done) |
| if (comparator.compare(start, end) == 0) |
| return; |
| } |
| i++; |
| continue; |
| } |
| |
| // Do we overwrite the current element? |
| if (markedAt > markedAts[i]) |
| { |
| // We do overwrite. |
| |
| // First deal with what might come before the newly added one. |
| if (comparator.compare(starts[i], start) < 0) |
| { |
| addInternal(i, starts[i], start, markedAts[i], delTimes[i]); |
| i++; |
| // We don't need to do the following line, but in spirit that's what we want to do |
| // setInternal(i, start, ends[i], markedAts[i], delTimes[i]) |
| } |
| |
| // now, start <= starts[i] |
| |
| // Does the new element stop before/at the current one? |
| int endCmp = comparator.compare(end, starts[i]); |
| if (endCmp <= 0) |
| { |
| // Here start <= starts[i] and end <= starts[i] |
| // This means the new element is before the current one. However, one special |
| // case is if end == starts[i] and starts[i] == ends[i]. In that case, |
| // the new element entirely overwrites the current one and we can just overwrite |
| if (endCmp == 0 && comparator.compare(starts[i], ends[i]) == 0) |
| setInternal(i, start, end, markedAt, delTime); |
| else |
| addInternal(i, start, end, markedAt, delTime); |
| return; |
| } |
| |
| // Do we overwrite the current element fully? |
| int cmp = comparator.compare(ends[i], end); |
| if (cmp <= 0) |
| { |
| // We do overwrite fully: |
| // update the current element up to its end and continue |
| // on with the next element (with the new inserted start == current end). |
| |
| // If we're on the last element, we can optimize |
| if (i == size-1) |
| { |
| setInternal(i, start, end, markedAt, delTime); |
| return; |
| } |
| |
| setInternal(i, start, ends[i], markedAt, delTime); |
| if (cmp == 0) |
| return; |
| |
| start = ends[i]; |
| i++; |
| } |
| else |
| { |
| // We don't overwrite fully. Insert the new interval, and then update the now-next |
| // one to reflect the not-overwritten parts. We're then done. |
| addInternal(i, start, end, markedAt, delTime); |
| i++; |
| setInternal(i, end, ends[i], markedAts[i], delTimes[i]); |
| return; |
| } |
| } |
| else |
| { |
| // we don't overwrite the current element |
| |
| // If the new interval starts before the current one, insert that new interval |
| if (comparator.compare(start, starts[i]) < 0) |
| { |
| // If we stop before the start of the current element, just insert the new |
| // interval and we're done; otherwise insert until the beginning of the |
| // current element |
| if (comparator.compare(end, starts[i]) <= 0) |
| { |
| addInternal(i, start, end, markedAt, delTime); |
| return; |
| } |
| addInternal(i, start, starts[i], markedAt, delTime); |
| i++; |
| } |
| |
| // After that, we're overwritten on the current element but might have |
| // some residual parts after ... |
| |
| // ... unless we don't extend beyond it. |
| if (comparator.compare(end, ends[i]) <= 0) |
| return; |
| |
| start = ends[i]; |
| i++; |
| } |
| } |
| |
| // If we got there, then just insert the remainder at the end |
| addInternal(i, start, end, markedAt, delTime); |
| } |
| |
| private int capacity() |
| { |
| return starts.length; |
| } |
| |
| private void addInternal(int i, LegacyBound start, LegacyBound end, long markedAt, int delTime) |
| { |
| assert i >= 0; |
| |
| if (size == capacity()) |
| growToFree(i); |
| else if (i < size) |
| moveElements(i); |
| |
| setInternal(i, start, end, markedAt, delTime); |
| size++; |
| } |
| |
| private void removeInternal(int i) |
| { |
| assert i >= 0; |
| |
| System.arraycopy(starts, i+1, starts, i, size - i - 1); |
| System.arraycopy(ends, i+1, ends, i, size - i - 1); |
| System.arraycopy(markedAts, i+1, markedAts, i, size - i - 1); |
| System.arraycopy(delTimes, i+1, delTimes, i, size - i - 1); |
| |
| --size; |
| starts[size] = null; |
| ends[size] = null; |
| } |
| |
| /* |
| * Grow the arrays, leaving index i "free" in the process. |
| */ |
| private void growToFree(int i) |
| { |
| int newLength = (capacity() * 3) / 2 + 1; |
| grow(i, newLength); |
| } |
| |
| /* |
| * Grow the arrays to match newLength capacity. |
| */ |
| private void grow(int newLength) |
| { |
| if (capacity() < newLength) |
| grow(-1, newLength); |
| } |
| |
| private void grow(int i, int newLength) |
| { |
| starts = grow(starts, size, newLength, i); |
| ends = grow(ends, size, newLength, i); |
| markedAts = grow(markedAts, size, newLength, i); |
| delTimes = grow(delTimes, size, newLength, i); |
| } |
| |
| private static LegacyBound[] grow(LegacyBound[] a, int size, int newLength, int i) |
| { |
| if (i < 0 || i >= size) |
| return Arrays.copyOf(a, newLength); |
| |
| LegacyBound[] newA = new LegacyBound[newLength]; |
| System.arraycopy(a, 0, newA, 0, i); |
| System.arraycopy(a, i, newA, i+1, size - i); |
| return newA; |
| } |
| |
| private static long[] grow(long[] a, int size, int newLength, int i) |
| { |
| if (i < 0 || i >= size) |
| return Arrays.copyOf(a, newLength); |
| |
| long[] newA = new long[newLength]; |
| System.arraycopy(a, 0, newA, 0, i); |
| System.arraycopy(a, i, newA, i+1, size - i); |
| return newA; |
| } |
| |
| private static int[] grow(int[] a, int size, int newLength, int i) |
| { |
| if (i < 0 || i >= size) |
| return Arrays.copyOf(a, newLength); |
| |
| int[] newA = new int[newLength]; |
| System.arraycopy(a, 0, newA, 0, i); |
| System.arraycopy(a, i, newA, i+1, size - i); |
| return newA; |
| } |
| |
| /* |
| * Move elements so that index i is "free", assuming the arrays have at least one free slot at the end. |
| */ |
| private void moveElements(int i) |
| { |
| if (i >= size) |
| return; |
| |
| System.arraycopy(starts, i, starts, i+1, size - i); |
| System.arraycopy(ends, i, ends, i+1, size - i); |
| System.arraycopy(markedAts, i, markedAts, i+1, size - i); |
| System.arraycopy(delTimes, i, delTimes, i+1, size - i); |
| // we set starts[i] to null to indicate the position is now empty; the caller is expected to fill it |
| // via setInternal() right after (this mirrors RangeTombstoneList, where nulling also kept the heap-size accounting correct) |
| starts[i] = null; |
| } |
| |
| private void setInternal(int i, LegacyBound start, LegacyBound end, long markedAt, int delTime) |
| { |
| starts[i] = start; |
| ends[i] = end; |
| markedAts[i] = markedAt; |
| delTimes[i] = delTime; |
| } |
| |
| public void updateDigest(MessageDigest digest) |
| { |
| ByteBuffer longBuffer = ByteBuffer.allocate(8); |
| for (int i = 0; i < size; i++) |
| { |
| for (int j = 0; j < starts[i].bound.size(); j++) |
| digest.update(starts[i].bound.get(j).duplicate()); |
| if (starts[i].collectionName != null) |
| digest.update(starts[i].collectionName.name.bytes.duplicate()); |
| for (int j = 0; j < ends[i].bound.size(); j++) |
| digest.update(ends[i].bound.get(j).duplicate()); |
| if (ends[i].collectionName != null) |
| digest.update(ends[i].collectionName.name.bytes.duplicate()); |
| |
| longBuffer.putLong(0, markedAts[i]); |
| digest.update(longBuffer.array(), 0, 8); |
| } |
| } |
| |
| public void serialize(DataOutputPlus out, CFMetaData metadata) throws IOException |
| { |
| out.writeInt(size); |
| if (size == 0) |
| return; |
| |
| if (metadata.isCompound()) |
| serializeCompound(out, metadata.isDense()); |
| else |
| serializeSimple(out); |
| } |
| |
| private void serializeCompound(DataOutputPlus out, boolean isDense) throws IOException |
| { |
| List<AbstractType<?>> types = new ArrayList<>(comparator.clusteringComparator.subtypes()); |
| |
| if (!isDense) |
| types.add(UTF8Type.instance); |
| |
| CompositeType type = CompositeType.getInstance(types); |
| |
| for (int i = 0; i < size; i++) |
| { |
| LegacyBound start = starts[i]; |
| LegacyBound end = ends[i]; |
| |
| CompositeType.Builder startBuilder = type.builder(start.isStatic); |
| CompositeType.Builder endBuilder = type.builder(end.isStatic); |
| for (int j = 0; j < start.bound.clustering().size(); j++) |
| { |
| startBuilder.add(start.bound.get(j)); |
| endBuilder.add(end.bound.get(j)); |
| } |
| |
| if (start.collectionName != null) |
| startBuilder.add(start.collectionName.name.bytes); |
| if (end.collectionName != null) |
| endBuilder.add(end.collectionName.name.bytes); |
| |
| ByteBufferUtil.writeWithShortLength(startBuilder.build(), out); |
| ByteBufferUtil.writeWithShortLength(endBuilder.buildAsEndOfRange(), out); |
| |
| out.writeInt(delTimes[i]); |
| out.writeLong(markedAts[i]); |
| } |
| } |
| |
| private void serializeSimple(DataOutputPlus out) throws IOException |
| { |
| List<AbstractType<?>> types = new ArrayList<>(comparator.clusteringComparator.subtypes()); |
| assert types.size() == 1 : types; |
| |
| for (int i = 0; i < size; i++) |
| { |
| LegacyBound start = starts[i]; |
| LegacyBound end = ends[i]; |
| |
| ClusteringPrefix startClustering = start.bound.clustering(); |
| ClusteringPrefix endClustering = end.bound.clustering(); |
| |
| assert startClustering.size() == 1; |
| assert endClustering.size() == 1; |
| |
| ByteBufferUtil.writeWithShortLength(startClustering.get(0), out); |
| ByteBufferUtil.writeWithShortLength(endClustering.get(0), out); |
| |
| out.writeInt(delTimes[i]); |
| out.writeLong(markedAts[i]); |
| } |
| } |
| |
| public long serializedSize(CFMetaData metadata) |
| { |
| long size = 0; |
| size += TypeSizes.sizeof(this.size); |
| |
| if (this.size == 0) |
| return size; |
| |
| if (metadata.isCompound()) |
| return size + serializedSizeCompound(metadata.isDense()); |
| else |
| return size + serializedSizeSimple(); |
| } |
| |
| private long serializedSizeCompound(boolean isDense) |
| { |
| long size = 0; |
| List<AbstractType<?>> types = new ArrayList<>(comparator.clusteringComparator.subtypes()); |
| if (!isDense) |
| types.add(UTF8Type.instance); |
| CompositeType type = CompositeType.getInstance(types); |
| |
| for (int i = 0; i < this.size; i++) |
| { |
| LegacyBound start = starts[i]; |
| LegacyBound end = ends[i]; |
| |
| CompositeType.Builder startBuilder = type.builder(); |
| CompositeType.Builder endBuilder = type.builder(); |
| for (int j = 0; j < start.bound.size(); j++) |
| startBuilder.add(start.bound.get(j)); |
| for (int j = 0; j < end.bound.size(); j++) |
| endBuilder.add(end.bound.get(j)); |
| |
| if (start.collectionName != null) |
| startBuilder.add(start.collectionName.name.bytes); |
| if (end.collectionName != null) |
| endBuilder.add(end.collectionName.name.bytes); |
| |
| size += ByteBufferUtil.serializedSizeWithShortLength(startBuilder.build()); |
| size += ByteBufferUtil.serializedSizeWithShortLength(endBuilder.buildAsEndOfRange()); |
| |
| size += TypeSizes.sizeof(delTimes[i]); |
| size += TypeSizes.sizeof(markedAts[i]); |
| } |
| return size; |
| } |
| |
| private long serializedSizeSimple() |
| { |
| long size = 0; |
| List<AbstractType<?>> types = new ArrayList<>(comparator.clusteringComparator.subtypes()); |
| assert types.size() == 1 : types; |
| |
| for (int i = 0; i < this.size; i++) |
| { |
| LegacyBound start = starts[i]; |
| LegacyBound end = ends[i]; |
| |
| ClusteringPrefix startClustering = start.bound.clustering(); |
| ClusteringPrefix endClustering = end.bound.clustering(); |
| |
| assert startClustering.size() == 1; |
| assert endClustering.size() == 1; |
| |
| size += ByteBufferUtil.serializedSizeWithShortLength(startClustering.get(0)); |
| size += ByteBufferUtil.serializedSizeWithShortLength(endClustering.get(0)); |
| |
| size += TypeSizes.sizeof(delTimes[i]); |
| size += TypeSizes.sizeof(markedAts[i]); |
| } |
| return size; |
| } |
| } |
| } |