/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.db;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.function.Predicate;
import javax.annotation.Nullable;
import com.google.common.collect.Lists;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.cassandra.config.*;
import org.apache.cassandra.cql3.Operator;
import org.apache.cassandra.db.filter.*;
import org.apache.cassandra.db.monitoring.ApproximateTime;
import org.apache.cassandra.db.monitoring.MonitorableImpl;
import org.apache.cassandra.db.partitions.*;
import org.apache.cassandra.db.rows.*;
import org.apache.cassandra.db.transform.RTBoundCloser;
import org.apache.cassandra.db.transform.RTBoundValidator;
import org.apache.cassandra.db.transform.RTBoundValidator.Stage;
import org.apache.cassandra.db.transform.StoppingTransformation;
import org.apache.cassandra.db.transform.Transformation;
import org.apache.cassandra.dht.AbstractBounds;
import org.apache.cassandra.index.Index;
import org.apache.cassandra.index.IndexNotAvailableException;
import org.apache.cassandra.io.ForwardingVersionedSerializer;
import org.apache.cassandra.io.IVersionedSerializer;
import org.apache.cassandra.io.util.DataInputPlus;
import org.apache.cassandra.io.util.DataOutputPlus;
import org.apache.cassandra.metrics.TableMetrics;
import org.apache.cassandra.net.MessageOut;
import org.apache.cassandra.net.MessagingService;
import org.apache.cassandra.schema.IndexMetadata;
import org.apache.cassandra.schema.UnknownIndexException;
import org.apache.cassandra.service.ClientWarn;
import org.apache.cassandra.tracing.Tracing;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.Pair;
/**
* General interface for storage-engine read commands (common to both range and
* single partition commands).
* <p>
* This contains all the information needed to do a local read.
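* <p>
* A minimal local-execution sketch (illustrative only; it assumes a {@code command} built
* elsewhere, e.g. via {@link SinglePartitionReadCommand}):
* <pre>{@code
* try (ReadExecutionController controller = command.executionController();
*      UnfilteredPartitionIterator partitions = command.executeLocally(controller))
* {
*     // consume partitions here
* }
* }</pre>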
*/
public abstract class ReadCommand extends MonitorableImpl implements ReadQuery
{
private static final int TEST_ITERATION_DELAY_MILLIS = Integer.parseInt(System.getProperty("cassandra.test.read_iteration_delay_ms", "0"));
protected static final Logger logger = LoggerFactory.getLogger(ReadCommand.class);
public static final IVersionedSerializer<ReadCommand> serializer = new Serializer();
// For READ verb: will either dispatch on 'serializer' for 3.0 or 'legacyReadCommandSerializer' for earlier versions.
// Can be removed (and replaced by 'serializer') once we drop pre-3.0 backward compatibility.
public static final IVersionedSerializer<ReadCommand> readSerializer = new ForwardingVersionedSerializer<ReadCommand>()
{
protected IVersionedSerializer<ReadCommand> delegate(int version)
{
return version < MessagingService.VERSION_30
? legacyReadCommandSerializer : serializer;
}
};
// For RANGE_SLICE verb: will either dispatch on 'serializer' for 3.0 or 'legacyRangeSliceCommandSerializer' for earlier versions.
// Can be removed (and replaced by 'serializer') once we drop pre-3.0 backward compatibility.
public static final IVersionedSerializer<ReadCommand> rangeSliceSerializer = new ForwardingVersionedSerializer<ReadCommand>()
{
protected IVersionedSerializer<ReadCommand> delegate(int version)
{
return version < MessagingService.VERSION_30
? legacyRangeSliceCommandSerializer : serializer;
}
};
// For PAGED_RANGE verb: will either dispatch on 'serializer' for 3.0 or 'legacyPagedRangeCommandSerializer' for earlier versions.
// Can be removed (and replaced by 'serializer') once we drop pre-3.0 backward compatibility.
public static final IVersionedSerializer<ReadCommand> pagedRangeSerializer = new ForwardingVersionedSerializer<ReadCommand>()
{
protected IVersionedSerializer<ReadCommand> delegate(int version)
{
return version < MessagingService.VERSION_30
? legacyPagedRangeCommandSerializer : serializer;
}
};
public static final IVersionedSerializer<ReadCommand> legacyRangeSliceCommandSerializer = new LegacyRangeSliceCommandSerializer();
public static final IVersionedSerializer<ReadCommand> legacyPagedRangeCommandSerializer = new LegacyPagedRangeCommandSerializer();
public static final IVersionedSerializer<ReadCommand> legacyReadCommandSerializer = new LegacyReadCommandSerializer();
private final Kind kind;
private final CFMetaData metadata;
private final int nowInSec;
private final ColumnFilter columnFilter;
private final RowFilter rowFilter;
private final DataLimits limits;
private final boolean isDigestQuery;
// if a digest query, the version for which the digest is expected. Ignored if not a digest.
private int digestVersion;
private final boolean isForThrift;
@Nullable
private final IndexMetadata index;
protected static abstract class SelectionDeserializer
{
public abstract ReadCommand deserialize(DataInputPlus in,
int version,
boolean isDigest,
int digestVersion,
boolean isForThrift,
CFMetaData metadata,
int nowInSec,
ColumnFilter columnFilter,
RowFilter rowFilter,
DataLimits limits,
IndexMetadata index) throws IOException;
}
protected enum Kind
{
SINGLE_PARTITION (SinglePartitionReadCommand.selectionDeserializer),
PARTITION_RANGE (PartitionRangeReadCommand.selectionDeserializer);
private final SelectionDeserializer selectionDeserializer;
Kind(SelectionDeserializer selectionDeserializer)
{
this.selectionDeserializer = selectionDeserializer;
}
}
protected ReadCommand(Kind kind,
boolean isDigestQuery,
int digestVersion,
boolean isForThrift,
CFMetaData metadata,
int nowInSec,
ColumnFilter columnFilter,
RowFilter rowFilter,
DataLimits limits,
IndexMetadata index)
{
this.kind = kind;
this.isDigestQuery = isDigestQuery;
this.digestVersion = digestVersion;
this.isForThrift = isForThrift;
this.metadata = metadata;
this.nowInSec = nowInSec;
this.columnFilter = columnFilter;
this.rowFilter = rowFilter;
this.limits = limits;
this.index = index;
}
protected abstract void serializeSelection(DataOutputPlus out, int version) throws IOException;
protected abstract long selectionSerializedSize(int version);
public abstract boolean isLimitedToOnePartition();
/**
* Creates a new <code>ReadCommand</code> instance with new limits.
*
* @param newLimits the new limits
* @return a new <code>ReadCommand</code> with the updated limits
*/
public abstract ReadCommand withUpdatedLimit(DataLimits newLimits);
/**
* The metadata for the table queried.
*
* @return the metadata for the table queried.
*/
public CFMetaData metadata()
{
return metadata;
}
/**
* The time in seconds to use as "now" for this query.
* <p>
* We use the same time as "now" for the whole query to avoid considering different
* values as expired during the query, which would be buggy (it would throw off counting, amongst
* other things).
*
* @return the time (in seconds) to use as "now".
*/
public int nowInSec()
{
return nowInSec;
}
/**
* The configured timeout for this command.
*
* @return the configured timeout for this command.
*/
public abstract long getTimeout();
/**
* A filter on which (non-PK) columns must be returned by the query.
*
* @return which columns must be fetched by this query.
*/
public ColumnFilter columnFilter()
{
return columnFilter;
}
/**
* Filters/Restrictions on CQL rows.
* <p>
* This contains the restrictions that are not directly handled by the
* {@code ClusteringIndexFilter}. More specifically, this includes any non-PK column
* restrictions and can include some PK column restrictions when those can't be
* satisfied entirely by the clustering index filter (because not all clustering columns
* have been restricted, for instance). If there are secondary indexes on the table,
* one of these restrictions might be handled by a secondary index.
*
* @return the filter holding the expression that rows must satisfy.
*/
public RowFilter rowFilter()
{
return rowFilter;
}
/**
* The limits set on this query.
*
* @return the limits set on this query.
*/
public DataLimits limits()
{
return limits;
}
/**
* Whether this query is a digest one or not.
*
* @return Whether this query is a digest query.
*/
public boolean isDigestQuery()
{
return isDigestQuery;
}
/**
* If the query is a digest one, the requested digest version.
*
* @return the requested digest version if the query is a digest. Otherwise, this can return
* anything.
*/
public int digestVersion()
{
return digestVersion;
}
/**
* Sets the digest version, for when a digest for this command is requested.
* <p>
* Note that we allow setting this independently of setting the command as a digest query as
* this allows us to use the command as a carrier of the digest version even if we only call
* setIsDigestQuery on some copy of it.
*
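* Illustrative sketch only (the surrounding coordinator flow is an assumption, not part of
* this class):
* <pre>{@code
* ReadCommand digestCommand = command.copyAsDigestQuery()
*                                    .setDigestVersion(MessagingService.current_version);
* }</pre>
*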
* @param digestVersion the digest version to use if this command is used for a digest query.
* @return this read command.
*/
public ReadCommand setDigestVersion(int digestVersion)
{
this.digestVersion = digestVersion;
return this;
}
/**
* Whether this query is for thrift or not.
*
* @return whether this query is for thrift.
*/
public boolean isForThrift()
{
return isForThrift;
}
/**
* Index (metadata) chosen for this query. Can be null.
*
* @return index (metadata) chosen for this query
*/
@Nullable
public IndexMetadata indexMetadata()
{
return index;
}
/**
* The clustering index filter this command uses for the provided key.
* <p>
* Note that this method should only be called on a key actually queried by this command
* and in practice, this will almost always return the same filter, but for the sake of
* paging, the filter on the first key of a range command might be slightly different.
*
* @param key a partition key queried by this command.
*
* @return the {@code ClusteringIndexFilter} to use for the partition of key {@code key}.
*/
public abstract ClusteringIndexFilter clusteringIndexFilter(DecoratedKey key);
/**
* Returns a copy of this command.
*
* @return a copy of this command.
*/
public abstract ReadCommand copy();
/**
* Returns a copy of this command with isDigestQuery set to true.
*/
public abstract ReadCommand copyAsDigestQuery();
protected abstract UnfilteredPartitionIterator queryStorage(ColumnFamilyStore cfs, ReadExecutionController executionController);
protected abstract int oldestUnrepairedTombstone();
/**
* Whether the underlying {@code ClusteringIndexFilter} is reversed or not.
*
* @return whether the underlying {@code ClusteringIndexFilter} is reversed or not.
*/
public abstract boolean isReversed();
public ReadResponse createResponse(UnfilteredPartitionIterator iterator)
{
// validate that the sequence of RT markers is correct: open is followed by close, deletion times for both
// ends are equal, and there are no dangling RT bounds in any partition.
iterator = RTBoundValidator.validate(iterator, Stage.PROCESSED, true);
return isDigestQuery()
? ReadResponse.createDigestResponse(iterator, this)
: ReadResponse.createDataResponse(iterator, this);
}
long indexSerializedSize(int version)
{
return null != index
? IndexMetadata.serializer.serializedSize(index, version)
: 0;
}
public Index getIndex(ColumnFamilyStore cfs)
{
return null != index
? cfs.indexManager.getIndex(index)
: null;
}
static IndexMetadata findIndex(CFMetaData table, RowFilter rowFilter)
{
if (table.getIndexes().isEmpty() || rowFilter.isEmpty())
return null;
ColumnFamilyStore cfs = Keyspace.openAndGetStore(table);
Index index = cfs.indexManager.getBestIndexFor(rowFilter);
return null != index
? index.getIndexMetadata()
: null;
}
/**
* If the index manager for the CFS determines that there's an applicable
* 2i that can be used to execute this command, call its (optional)
* validation method to check that nothing in this command's parameters
* violates the implementation specific validation rules.
*/
public void maybeValidateIndex()
{
Index index = getIndex(Keyspace.openAndGetStore(metadata));
if (null != index)
index.validate(this);
}
/**
* Executes this command on the local host.
*
* @param executionController the execution controller spanning this command
*
* @return an iterator over the result of executing this command locally.
*/
@SuppressWarnings("resource") // The result iterator is closed upon exceptions (we know it's fine to potentially not close the intermediary
// iterators created inside the try as long as we do close the original resultIterator), or by closing the result.
public UnfilteredPartitionIterator executeLocally(ReadExecutionController executionController)
{
long startTimeNanos = System.nanoTime();
ColumnFamilyStore cfs = Keyspace.openAndGetStore(metadata());
Index index = getIndex(cfs);
Index.Searcher searcher = null;
if (index != null)
{
if (!cfs.indexManager.isIndexQueryable(index))
throw new IndexNotAvailableException(index);
searcher = index.searcherFor(this);
Tracing.trace("Executing read on {}.{} using index {}", cfs.metadata.ksName, cfs.metadata.cfName, index.getIndexMetadata().name);
}
UnfilteredPartitionIterator iterator = (null == searcher) ? queryStorage(cfs, executionController) : searcher.search(executionController);
iterator = RTBoundValidator.validate(iterator, Stage.MERGED, false);
try
{
iterator = withStateTracking(iterator);
iterator = RTBoundValidator.validate(withoutPurgeableTombstones(iterator, cfs), Stage.PURGED, false);
iterator = withMetricsRecording(iterator, cfs.metric, startTimeNanos);
// If we've used a 2ndary index, we know the result already satisfies the primary expression used, so
// no point in checking it again.
RowFilter filter = (null == searcher) ? rowFilter() : index.getPostIndexQueryFilter(rowFilter());
/*
* TODO: We'll currently do filtering by the rowFilter here because it's convenient. However,
* we'll probably want to optimize by pushing it down a layer (like for dropped columns) as it
* would be more efficient (the sooner we discard stuff we know we don't care about, the less useless
* processing we do on it).
*/
iterator = filter.filter(iterator, nowInSec());
// apply the limits/row counter; this is a stopping transformation and will close the iterator as soon
// as the count is observed; if that happens in the middle of an open RT, its end bound will not be included.
iterator = limits().filter(iterator, nowInSec(), selectsFullPartition());
// because of the above, we need to append an artificial end bound if the source iterator was stopped short by a counter.
return RTBoundCloser.close(iterator);
}
catch (RuntimeException | Error e)
{
iterator.close();
throw e;
}
}
protected abstract void recordLatency(TableMetrics metric, long latencyNanos);
public PartitionIterator executeInternal(ReadExecutionController controller)
{
return UnfilteredPartitionIterators.filter(executeLocally(controller), nowInSec());
}
public ReadExecutionController executionController()
{
return ReadExecutionController.forCommand(this);
}
/**
* Wraps the provided iterator so that metrics on what is scanned by the command are recorded.
* This also logs a warning or throws a TombstoneOverwhelmingException if appropriate.
*/
private UnfilteredPartitionIterator withMetricsRecording(UnfilteredPartitionIterator iter, final TableMetrics metric, final long startTimeNanos)
{
class MetricRecording extends Transformation<UnfilteredRowIterator>
{
private final int failureThreshold = DatabaseDescriptor.getTombstoneFailureThreshold();
private final int warningThreshold = DatabaseDescriptor.getTombstoneWarnThreshold();
private final boolean respectTombstoneThresholds = !SchemaConstants.isLocalSystemKeyspace(ReadCommand.this.metadata().ksName);
private final boolean enforceStrictLiveness = metadata.enforceStrictLiveness();
private int liveRows = 0;
private int tombstones = 0;
private DecoratedKey currentKey;
@Override
public UnfilteredRowIterator applyToPartition(UnfilteredRowIterator iter)
{
currentKey = iter.partitionKey();
return Transformation.apply(iter, this);
}
@Override
public Row applyToStatic(Row row)
{
return applyToRow(row);
}
/**
* Count the number of live rows returned by the read command and the number of tombstones.
*
* Tombstones come in two forms on rows:
* - cells that aren't live anymore (either expired through TTL or deleted): 1 tombstone per cell
* - rows that aren't live and have no cells (DELETEs performed on the primary key): 1 tombstone per row
* We avoid counting rows as tombstones if they contain nothing but expired cells.
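*
* For example (illustrative): a row with two expired cells counts 2 cell tombstones; a
* primary-key-deleted row with no cells counts 1 row tombstone; and a row whose only
* content is expired cells is not additionally counted as a row tombstone.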
*/
@Override
public Row applyToRow(Row row)
{
boolean hasTombstones = false;
for (Cell cell : row.cells())
{
if (!cell.isLive(ReadCommand.this.nowInSec()))
{
countTombstone(row.clustering());
hasTombstones = true; // lets us avoid counting an extra tombstone if the whole row expired
}
}
if (row.hasLiveData(ReadCommand.this.nowInSec(), enforceStrictLiveness))
++liveRows;
else if (!row.primaryKeyLivenessInfo().isLive(ReadCommand.this.nowInSec())
&& row.hasDeletion(ReadCommand.this.nowInSec())
&& !hasTombstones)
{
// We're counting primary key deletions only here.
countTombstone(row.clustering());
}
return row;
}
@Override
public RangeTombstoneMarker applyToMarker(RangeTombstoneMarker marker)
{
countTombstone(marker.clustering());
return marker;
}
private void countTombstone(ClusteringPrefix clustering)
{
++tombstones;
if (tombstones > failureThreshold && respectTombstoneThresholds)
{
String query = ReadCommand.this.toCQLString();
Tracing.trace("Scanned over {} tombstones for query {}; query aborted (see tombstone_failure_threshold)", failureThreshold, query);
throw new TombstoneOverwhelmingException(tombstones, query, ReadCommand.this.metadata(), currentKey, clustering);
}
}
@Override
public void onClose()
{
recordLatency(metric, System.nanoTime() - startTimeNanos);
metric.tombstoneScannedHistogram.update(tombstones);
metric.liveScannedHistogram.update(liveRows);
boolean warnTombstones = tombstones > warningThreshold && respectTombstoneThresholds;
if (warnTombstones)
{
String msg = String.format(
"Read %d live rows and %d tombstone cells for query %1.512s; token %s (see tombstone_warn_threshold)",
liveRows, tombstones, ReadCommand.this.toCQLString(), currentKey.getToken());
ClientWarn.instance.warn(msg);
logger.warn(msg);
}
Tracing.trace("Read {} live rows and {} tombstone cells{}",
liveRows, tombstones,
(warnTombstones ? " (see tombstone_warn_threshold)" : ""));
}
};
return Transformation.apply(iter, new MetricRecording());
}
protected class CheckForAbort extends StoppingTransformation<UnfilteredRowIterator>
{
long lastChecked = 0;
protected UnfilteredRowIterator applyToPartition(UnfilteredRowIterator partition)
{
if (maybeAbort())
{
partition.close();
return null;
}
return Transformation.apply(partition, this);
}
protected Row applyToRow(Row row)
{
if (TEST_ITERATION_DELAY_MILLIS > 0)
maybeDelayForTesting();
return maybeAbort() ? null : row;
}
private boolean maybeAbort()
{
/**
* The value returned by ApproximateTime.currentTimeMillis() is updated only every
* {@link ApproximateTime#CHECK_INTERVAL_MS}, by default 10 millis. Since MonitorableImpl
* relies on ApproximateTime, we don't need to check unless the approximate time has elapsed.
*/
if (lastChecked == ApproximateTime.currentTimeMillis())
return false;
lastChecked = ApproximateTime.currentTimeMillis();
if (isAborted())
{
stop();
return true;
}
return false;
}
private void maybeDelayForTesting()
{
if (!metadata.ksName.startsWith("system"))
FBUtilities.sleepQuietly(TEST_ITERATION_DELAY_MILLIS);
}
}
protected UnfilteredPartitionIterator withStateTracking(UnfilteredPartitionIterator iter)
{
return Transformation.apply(iter, new CheckForAbort());
}
/**
* Creates a message for this command.
*/
public abstract MessageOut<ReadCommand> createMessage(int version);
protected abstract void appendCQLWhereClause(StringBuilder sb);
// Skip purgeable tombstones. We do this because it's safe to do (post-merge of the memtable and sstable at least), it
// can save us some bandwidth, and avoid making us throw a TombstoneOverwhelmingException for purgeable tombstones (which
// are to some extent an artefact of compaction lagging behind and hence counting them is somewhat unintuitive).
protected UnfilteredPartitionIterator withoutPurgeableTombstones(UnfilteredPartitionIterator iterator, ColumnFamilyStore cfs)
{
final boolean isForThrift = iterator.isForThrift();
class WithoutPurgeableTombstones extends PurgeFunction
{
public WithoutPurgeableTombstones()
{
super(isForThrift,
nowInSec(),
cfs.gcBefore(nowInSec()),
oldestUnrepairedTombstone(),
cfs.getCompactionStrategyManager().onlyPurgeRepairedTombstones(),
cfs.metadata.enforceStrictLiveness());
}
protected Predicate<Long> getPurgeEvaluator()
{
return time -> true;
}
}
return Transformation.apply(iterator, new WithoutPurgeableTombstones());
}
/**
* Recreate the CQL string corresponding to this query.
* <p>
* Note that in general the returned string will not be exactly the original user string, first
* because there isn't always a single syntax for a given query, but also because we don't have
* all the information needed (we know the non-PK columns queried but not the PK ones as internally
* we query them all). So it shouldn't be relied on too strongly, but it should be good enough for
* debugging purposes, which is what this is for.
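* <p>
* For example, a command might render as (shape only, not guaranteed verbatim):
* <pre>{@code
* SELECT a, b FROM ks.tbl WHERE pk = ? LIMIT 100
* }</pre>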
*/
public String toCQLString()
{
StringBuilder sb = new StringBuilder();
sb.append("SELECT ").append(columnFilter());
sb.append(" FROM ").append(metadata().ksName).append('.').append(metadata.cfName);
appendCQLWhereClause(sb);
if (limits() != DataLimits.NONE)
sb.append(' ').append(limits());
return sb.toString();
}
// Monitorable interface
public String name()
{
return toCQLString();
}
private static class Serializer implements IVersionedSerializer<ReadCommand>
{
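// Flag bits of the serialized header byte: 0x01 = digest query, 0x02 = for Thrift, 0x04 = index present.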
private static int digestFlag(boolean isDigest)
{
return isDigest ? 0x01 : 0;
}
private static boolean isDigest(int flags)
{
return (flags & 0x01) != 0;
}
private static int thriftFlag(boolean isForThrift)
{
return isForThrift ? 0x02 : 0;
}
private static boolean isForThrift(int flags)
{
return (flags & 0x02) != 0;
}
private static int indexFlag(boolean hasIndex)
{
return hasIndex ? 0x04 : 0;
}
private static boolean hasIndex(int flags)
{
return (flags & 0x04) != 0;
}
public void serialize(ReadCommand command, DataOutputPlus out, int version) throws IOException
{
assert version >= MessagingService.VERSION_30;
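// Serialized layout (3.0+): kind (1 byte), flags (1 byte; see the flag helpers above),
// digest version (unsigned vint, digest queries only), table metadata, nowInSec (int),
// column filter, row filter, limits, index metadata (only if the index flag is set),
// and finally the kind-specific selection.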
out.writeByte(command.kind.ordinal());
out.writeByte(digestFlag(command.isDigestQuery()) | thriftFlag(command.isForThrift()) | indexFlag(null != command.index));
if (command.isDigestQuery())
out.writeUnsignedVInt(command.digestVersion());
CFMetaData.serializer.serialize(command.metadata(), out, version);
out.writeInt(command.nowInSec());
ColumnFilter.serializer.serialize(command.columnFilter(), out, version);
RowFilter.serializer.serialize(command.rowFilter(), out, version);
DataLimits.serializer.serialize(command.limits(), out, version, command.metadata.comparator);
if (null != command.index)
IndexMetadata.serializer.serialize(command.index, out, version);
command.serializeSelection(out, version);
}
public ReadCommand deserialize(DataInputPlus in, int version) throws IOException
{
assert version >= MessagingService.VERSION_30;
Kind kind = Kind.values()[in.readByte()];
int flags = in.readByte();
boolean isDigest = isDigest(flags);
boolean isForThrift = isForThrift(flags);
boolean hasIndex = hasIndex(flags);
int digestVersion = isDigest ? (int)in.readUnsignedVInt() : 0;
CFMetaData metadata = CFMetaData.serializer.deserialize(in, version);
int nowInSec = in.readInt();
ColumnFilter columnFilter = ColumnFilter.serializer.deserialize(in, version, metadata);
RowFilter rowFilter = RowFilter.serializer.deserialize(in, version, metadata);
DataLimits limits = DataLimits.serializer.deserialize(in, version, metadata.comparator);
IndexMetadata index = hasIndex ? deserializeIndexMetadata(in, version, metadata) : null;
return kind.selectionDeserializer.deserialize(in, version, isDigest, digestVersion, isForThrift, metadata, nowInSec, columnFilter, rowFilter, limits, index);
}
private IndexMetadata deserializeIndexMetadata(DataInputPlus in, int version, CFMetaData cfm) throws IOException
{
try
{
return IndexMetadata.serializer.deserialize(in, version, cfm);
}
catch (UnknownIndexException e)
{
logger.info("Couldn't find a defined index on {}.{} with the id {}. " +
"If an index was just created, this is likely due to the schema not " +
"being fully propagated. Local read will proceed without using the " +
"index. Please wait for schema agreement after index creation.",
cfm.ksName, cfm.cfName, e.indexId);
return null;
}
}
public long serializedSize(ReadCommand command, int version)
{
assert version >= MessagingService.VERSION_30;
return 2 // kind + flags
+ (command.isDigestQuery() ? TypeSizes.sizeofUnsignedVInt(command.digestVersion()) : 0)
+ CFMetaData.serializer.serializedSize(command.metadata(), version)
+ TypeSizes.sizeof(command.nowInSec())
+ ColumnFilter.serializer.serializedSize(command.columnFilter(), version)
+ RowFilter.serializer.serializedSize(command.rowFilter(), version)
+ DataLimits.serializer.serializedSize(command.limits(), version, command.metadata.comparator)
+ command.selectionSerializedSize(version)
+ command.indexSerializedSize(version);
}
}
private enum LegacyType
{
GET_BY_NAMES((byte)1),
GET_SLICES((byte)2);
public final byte serializedValue;
LegacyType(byte b)
{
this.serializedValue = b;
}
public static LegacyType fromPartitionFilterKind(ClusteringIndexFilter.Kind kind)
{
return kind == ClusteringIndexFilter.Kind.SLICE
? GET_SLICES
: GET_BY_NAMES;
}
public static LegacyType fromSerializedValue(byte b)
{
return b == 1 ? GET_BY_NAMES : GET_SLICES;
}
}
/**
* Serializer for pre-3.0 RangeSliceCommands.
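* <p>
* Wire layout, as written by {@code serialize} below: keyspace (UTF), table (UTF),
* timestamp in millis (long), filter type (1 byte: 0 = slices, 1 = names), the
* clustering filter (with its limit and compositesToGroup for slices), the row filter,
* the key range, maxResults (int), countCQL3Rows (boolean) and isPaging (boolean).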
*/
private static class LegacyRangeSliceCommandSerializer implements IVersionedSerializer<ReadCommand>
{
public void serialize(ReadCommand command, DataOutputPlus out, int version) throws IOException
{
assert version < MessagingService.VERSION_30;
PartitionRangeReadCommand rangeCommand = (PartitionRangeReadCommand) command;
assert !rangeCommand.dataRange().isPaging();
// convert pre-3.0 incompatible names filters to slice filters
rangeCommand = maybeConvertNamesToSlice(rangeCommand);
CFMetaData metadata = rangeCommand.metadata();
out.writeUTF(metadata.ksName);
out.writeUTF(metadata.cfName);
out.writeLong(rangeCommand.nowInSec() * 1000L); // convert from seconds to millis
// begin DiskAtomFilterSerializer.serialize()
if (rangeCommand.isNamesQuery())
{
out.writeByte(1); // 0 for slices, 1 for names
ClusteringIndexNamesFilter filter = (ClusteringIndexNamesFilter) rangeCommand.dataRange().clusteringIndexFilter;
LegacyReadCommandSerializer.serializeNamesFilter(rangeCommand, filter, out);
}
else
{
out.writeByte(0); // 0 for slices, 1 for names
// slice filter serialization
ClusteringIndexSliceFilter filter = (ClusteringIndexSliceFilter) rangeCommand.dataRange().clusteringIndexFilter;
boolean makeStaticSlice = !rangeCommand.columnFilter().fetchedColumns().statics.isEmpty() && !filter.requestedSlices().selects(Clustering.STATIC_CLUSTERING);
LegacyReadCommandSerializer.serializeSlices(out, filter.requestedSlices(), filter.isReversed(), makeStaticSlice, metadata);
out.writeBoolean(filter.isReversed());
// limit
DataLimits limits = rangeCommand.limits();
if (limits.isDistinct())
out.writeInt(1);
else
out.writeInt(LegacyReadCommandSerializer.updateLimitForQuery(rangeCommand.limits().count(), filter.requestedSlices()));
int compositesToGroup;
boolean selectsStatics = !rangeCommand.columnFilter().fetchedColumns().statics.isEmpty() && filter.requestedSlices().selects(Clustering.STATIC_CLUSTERING);
if (limits.kind() == DataLimits.Kind.THRIFT_LIMIT)
compositesToGroup = -1;
else if (limits.isDistinct() && !selectsStatics)
compositesToGroup = -2; // for DISTINCT queries (CASSANDRA-8490)
else
compositesToGroup = metadata.isDense() ? -1 : metadata.clusteringColumns().size();
out.writeInt(compositesToGroup);
}
serializeRowFilter(out, rangeCommand.rowFilter());
AbstractBounds.rowPositionSerializer.serialize(rangeCommand.dataRange().keyRange(), out, version);
// maxResults
out.writeInt(rangeCommand.limits().count());
// countCQL3Rows
if (rangeCommand.isForThrift() || rangeCommand.limits().perPartitionCount() == 1) // if for Thrift or DISTINCT
out.writeBoolean(false);
else
out.writeBoolean(true);
// isPaging
out.writeBoolean(false);
}
public ReadCommand deserialize(DataInputPlus in, int version) throws IOException
{
assert version < MessagingService.VERSION_30;
String keyspace = in.readUTF();
String columnFamily = in.readUTF();
CFMetaData metadata = Schema.instance.getCFMetaData(keyspace, columnFamily);
if (metadata == null)
{
String message = String.format("Got legacy range command for nonexistent table %s.%s.", keyspace, columnFamily);
throw new UnknownColumnFamilyException(message, null);
}
int nowInSec = (int) (in.readLong() / 1000); // convert from millis to seconds
ClusteringIndexFilter filter;
ColumnFilter selection;
int compositesToGroup = 0;
int perPartitionLimit = -1;
byte readType = in.readByte(); // 0 for slices, 1 for names
if (readType == 1)
{
Pair<ColumnFilter, ClusteringIndexNamesFilter> selectionAndFilter = LegacyReadCommandSerializer.deserializeNamesSelectionAndFilter(in, metadata);
selection = selectionAndFilter.left;
filter = selectionAndFilter.right;
}
else
{
Pair<ClusteringIndexSliceFilter, Boolean> p = LegacyReadCommandSerializer.deserializeSlicePartitionFilter(in, metadata);
filter = p.left;
perPartitionLimit = in.readInt();
compositesToGroup = in.readInt();
selection = getColumnSelectionForSlice(p.right, compositesToGroup, metadata);
}
RowFilter rowFilter = deserializeRowFilter(in, metadata);
AbstractBounds<PartitionPosition> keyRange = AbstractBounds.rowPositionSerializer.deserialize(in, metadata.partitioner, version);
int maxResults = in.readInt();
boolean countCQL3Rows = in.readBoolean(); // used below to detect DISTINCT queries
in.readBoolean(); // isPaging (not needed)
boolean selectsStatics = (!selection.fetchedColumns().statics.isEmpty() || filter.selects(Clustering.STATIC_CLUSTERING));
// We have 2 types of DISTINCT queries: ones on only the partition key, and ones on the partition key and static columns. For the former,
// we can easily detect the case because compositesToGroup is -2 and that's the only case it can be that. The latter one is slightly less
// direct, but we know that on 2.1/2.2 queries, DISTINCT queries are the only CQL queries that have countCQL3Rows set to false, so we use
// that fact.
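// (Illustrative: a 'SELECT DISTINCT pk FROM t' arrives with compositesToGroup == -2, while a
// DISTINCT query that also selects static columns arrives with countCQL3Rows == false instead.)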
boolean isDistinct = compositesToGroup == -2 || (compositesToGroup != -1 && !countCQL3Rows);
DataLimits limits;
if (isDistinct)
limits = DataLimits.distinctLimits(maxResults);
else if (compositesToGroup == -1)
limits = DataLimits.thriftLimits(maxResults, perPartitionLimit);
else if (metadata.isStaticCompactTable())
limits = DataLimits.legacyCompactStaticCqlLimits(maxResults);
else
limits = DataLimits.cqlLimits(maxResults);
return PartitionRangeReadCommand.create(true, metadata, nowInSec, selection, rowFilter, limits, new DataRange(keyRange, filter));
}
static void serializeRowFilter(DataOutputPlus out, RowFilter rowFilter) throws IOException
{
ArrayList<RowFilter.Expression> indexExpressions = Lists.newArrayList(rowFilter.iterator());
out.writeInt(indexExpressions.size());
for (RowFilter.Expression expression : indexExpressions)
{
ByteBufferUtil.writeWithShortLength(expression.column().name.bytes, out);
expression.operator().writeTo(out);
ByteBufferUtil.writeWithShortLength(expression.getIndexValue(), out);
}
}
static RowFilter deserializeRowFilter(DataInputPlus in, CFMetaData metadata) throws IOException
{
int numRowFilters = in.readInt();
if (numRowFilters == 0)
return RowFilter.NONE;
RowFilter rowFilter = RowFilter.create(numRowFilters);
for (int i = 0; i < numRowFilters; i++)
{
ByteBuffer columnName = ByteBufferUtil.readWithShortLength(in);
ColumnDefinition column = metadata.getColumnDefinition(columnName);
Operator op = Operator.readFrom(in);
ByteBuffer indexValue = ByteBufferUtil.readWithShortLength(in);
rowFilter.add(column, op, indexValue);
}
return rowFilter;
}
static long serializedRowFilterSize(RowFilter rowFilter)
{
long size = TypeSizes.sizeof(0); // rowFilterCount
for (RowFilter.Expression expression : rowFilter)
{
size += ByteBufferUtil.serializedSizeWithShortLength(expression.column().name.bytes);
size += TypeSizes.sizeof(0); // operator int value
size += ByteBufferUtil.serializedSizeWithShortLength(expression.getIndexValue());
}
return size;
}
public long serializedSize(ReadCommand command, int version)
{
assert version < MessagingService.VERSION_30;
assert command.kind == Kind.PARTITION_RANGE;
PartitionRangeReadCommand rangeCommand = (PartitionRangeReadCommand) command;
rangeCommand = maybeConvertNamesToSlice(rangeCommand);
CFMetaData metadata = rangeCommand.metadata();
long size = TypeSizes.sizeof(metadata.ksName);
size += TypeSizes.sizeof(metadata.cfName);
size += TypeSizes.sizeof((long) rangeCommand.nowInSec());
size += 1; // single byte flag: 0 for slices, 1 for names
if (rangeCommand.isNamesQuery())
{
PartitionColumns columns = rangeCommand.columnFilter().fetchedColumns();
ClusteringIndexNamesFilter filter = (ClusteringIndexNamesFilter) rangeCommand.dataRange().clusteringIndexFilter;
size += LegacyReadCommandSerializer.serializedNamesFilterSize(filter, metadata, columns);
}
else
{
ClusteringIndexSliceFilter filter = (ClusteringIndexSliceFilter) rangeCommand.dataRange().clusteringIndexFilter;
boolean makeStaticSlice = !rangeCommand.columnFilter().fetchedColumns().statics.isEmpty() && !filter.requestedSlices().selects(Clustering.STATIC_CLUSTERING);
size += LegacyReadCommandSerializer.serializedSlicesSize(filter.requestedSlices(), makeStaticSlice, metadata);
size += TypeSizes.sizeof(filter.isReversed());
size += TypeSizes.sizeof(rangeCommand.limits().perPartitionCount());
size += TypeSizes.sizeof(0); // compositesToGroup
}
if (rangeCommand.rowFilter().equals(RowFilter.NONE))
{
size += TypeSizes.sizeof(0);
}
else
{
ArrayList<RowFilter.Expression> indexExpressions = Lists.newArrayList(rangeCommand.rowFilter().iterator());
size += TypeSizes.sizeof(indexExpressions.size());
for (RowFilter.Expression expression : indexExpressions)
{
size += ByteBufferUtil.serializedSizeWithShortLength(expression.column().name.bytes);
size += TypeSizes.sizeof(expression.operator().ordinal());
size += ByteBufferUtil.serializedSizeWithShortLength(expression.getIndexValue());
}
}
size += AbstractBounds.rowPositionSerializer.serializedSize(rangeCommand.dataRange().keyRange(), version);
size += TypeSizes.sizeof(rangeCommand.limits().count());
size += TypeSizes.sizeof(!rangeCommand.isForThrift());
return size + TypeSizes.sizeof(rangeCommand.dataRange().isPaging());
}
static PartitionRangeReadCommand maybeConvertNamesToSlice(PartitionRangeReadCommand command)
{
if (!command.dataRange().isNamesQuery())
return command;
CFMetaData metadata = command.metadata();
if (!LegacyReadCommandSerializer.shouldConvertNamesToSlice(metadata, command.columnFilter().fetchedColumns()))
return command;
ClusteringIndexNamesFilter filter = (ClusteringIndexNamesFilter) command.dataRange().clusteringIndexFilter;
ClusteringIndexSliceFilter sliceFilter = LegacyReadCommandSerializer.convertNamesFilterToSliceFilter(filter, metadata);
DataRange newRange = new DataRange(command.dataRange().keyRange(), sliceFilter);
return command.withUpdatedDataRange(newRange);
}
static ColumnFilter getColumnSelectionForSlice(boolean selectsStatics, int compositesToGroup, CFMetaData metadata)
{
// A value of -2 indicates this is a DISTINCT query that doesn't select static columns, only partition keys.
// In that case, we'll basically be querying the first row of the partition, but we must make sure we include
// all columns, so that we get at least one cell if there is a live row; anything else would confuse pre-3.0 nodes.
if (compositesToGroup == -2)
return ColumnFilter.all(metadata);
// if a slice query from a pre-3.0 node doesn't cover statics, we shouldn't select them at all
PartitionColumns columns = selectsStatics
? metadata.partitionColumns()
: metadata.partitionColumns().withoutStatics();
return ColumnFilter.selectionBuilder().addAll(columns).build();
}
}
/**
* Serializer for pre-3.0 PagedRangeCommands.
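* <p>
* The layout is close to the range-slice format, but additionally carries the
* command-level "start" and "stop" composites used for paging (see serialize below).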
*/
private static class LegacyPagedRangeCommandSerializer implements IVersionedSerializer<ReadCommand>
{
public void serialize(ReadCommand command, DataOutputPlus out, int version) throws IOException
{
assert version < MessagingService.VERSION_30;
PartitionRangeReadCommand rangeCommand = (PartitionRangeReadCommand) command;
assert rangeCommand.dataRange().isPaging();
CFMetaData metadata = rangeCommand.metadata();
out.writeUTF(metadata.ksName);
out.writeUTF(metadata.cfName);
out.writeLong(rangeCommand.nowInSec() * 1000L); // convert from seconds to millis
AbstractBounds.rowPositionSerializer.serialize(rangeCommand.dataRange().keyRange(), out, version);
// pre-3.0 nodes don't accept names filters for paged range commands
ClusteringIndexSliceFilter filter;
if (rangeCommand.dataRange().clusteringIndexFilter.kind() == ClusteringIndexFilter.Kind.NAMES)
filter = LegacyReadCommandSerializer.convertNamesFilterToSliceFilter((ClusteringIndexNamesFilter) rangeCommand.dataRange().clusteringIndexFilter, metadata);
else
filter = (ClusteringIndexSliceFilter) rangeCommand.dataRange().clusteringIndexFilter;
// slice filter
boolean makeStaticSlice = !rangeCommand.columnFilter().fetchedColumns().statics.isEmpty() && !filter.requestedSlices().selects(Clustering.STATIC_CLUSTERING);
LegacyReadCommandSerializer.serializeSlices(out, filter.requestedSlices(), filter.isReversed(), makeStaticSlice, metadata);
out.writeBoolean(filter.isReversed());
// slice filter's count
DataLimits.Kind kind = rangeCommand.limits().kind();
boolean isDistinct = (kind == DataLimits.Kind.CQL_LIMIT || kind == DataLimits.Kind.CQL_PAGING_LIMIT) && rangeCommand.limits().perPartitionCount() == 1;
if (isDistinct)
out.writeInt(1);
else
out.writeInt(LegacyReadCommandSerializer.updateLimitForQuery(rangeCommand.limits().perPartitionCount(), filter.requestedSlices()));
// compositesToGroup
boolean selectsStatics = !rangeCommand.columnFilter().fetchedColumns().statics.isEmpty() || filter.requestedSlices().selects(Clustering.STATIC_CLUSTERING);
int compositesToGroup;
if (kind == DataLimits.Kind.THRIFT_LIMIT)
compositesToGroup = -1;
else if (isDistinct && !selectsStatics)
compositesToGroup = -2; // for DISTINCT queries (CASSANDRA-8490)
else
compositesToGroup = metadata.isDense() ? -1 : metadata.clusteringColumns().size();
out.writeInt(compositesToGroup);
// command-level "start" and "stop" composites. The start is the last-returned cell name if there is one,
// otherwise it's the same as the slice filter's start. The stop appears to always be the same as the
// slice filter's stop.
DataRange.Paging pagingRange = (DataRange.Paging) rangeCommand.dataRange();
Clustering lastReturned = pagingRange.getLastReturned();
ClusteringBound newStart = ClusteringBound.inclusiveStartOf(lastReturned);
Slice lastSlice = filter.requestedSlices().get(filter.requestedSlices().size() - 1);
ByteBufferUtil.writeWithShortLength(LegacyLayout.encodeBound(metadata, newStart, true), out);
ByteBufferUtil.writeWithShortLength(LegacyLayout.encodeClustering(metadata, lastSlice.end().clustering()), out);
LegacyRangeSliceCommandSerializer.serializeRowFilter(out, rangeCommand.rowFilter());
// command-level limit
// Pre-3.0 we would always request one more row than we actually needed and the command-level "start" would
// be the last-returned cell name, so the response would always include it.
int maxResults = rangeCommand.limits().count() + 1;
out.writeInt(maxResults);
// countCQL3Rows
if (rangeCommand.isForThrift() || rangeCommand.limits().perPartitionCount() == 1) // for Thrift or DISTINCT
out.writeBoolean(false);
else
out.writeBoolean(true);
}
public ReadCommand deserialize(DataInputPlus in, int version) throws IOException
{
assert version < MessagingService.VERSION_30;
String keyspace = in.readUTF();
String columnFamily = in.readUTF();
CFMetaData metadata = Schema.instance.getCFMetaData(keyspace, columnFamily);
if (metadata == null)
{
String message = String.format("Got legacy paged range command for nonexistent table %s.%s.", keyspace, columnFamily);
throw new UnknownColumnFamilyException(message, null);
}
int nowInSec = (int) (in.readLong() / 1000); // convert from millis to seconds
AbstractBounds<PartitionPosition> keyRange = AbstractBounds.rowPositionSerializer.deserialize(in, metadata.partitioner, version);
Pair<ClusteringIndexSliceFilter, Boolean> p = LegacyReadCommandSerializer.deserializeSlicePartitionFilter(in, metadata);
ClusteringIndexSliceFilter filter = p.left;
boolean selectsStatics = p.right;
int perPartitionLimit = in.readInt();
int compositesToGroup = in.readInt();
// command-level Composite "start" and "stop"
LegacyLayout.LegacyBound startBound = LegacyLayout.decodeSliceBound(metadata, ByteBufferUtil.readWithShortLength(in), true);
ByteBufferUtil.readWithShortLength(in); // the composite "stop", which isn't actually needed
ColumnFilter selection = LegacyRangeSliceCommandSerializer.getColumnSelectionForSlice(selectsStatics, compositesToGroup, metadata);
RowFilter rowFilter = LegacyRangeSliceCommandSerializer.deserializeRowFilter(in, metadata);
int maxResults = in.readInt();
boolean countCQL3Rows = in.readBoolean();
// We have 2 types of DISTINCT queries: ones on only the partition key, and ones on the partition key and static columns. For the former,
// we can easily detect the case because compositesToGroup is -2 and that's the only case it can be that. The latter one is slightly less
// direct, but we know that on 2.1/2.2 queries, DISTINCT queries are the only CQL queries that have countCQL3Rows set to false, so we use
// that fact.
boolean isDistinct = compositesToGroup == -2 || (compositesToGroup != -1 && !countCQL3Rows);
DataLimits limits;
if (isDistinct)
limits = DataLimits.distinctLimits(maxResults);
else
limits = DataLimits.cqlLimits(maxResults);
limits = limits.forPaging(maxResults);
// The pagedRangeCommand is used in pre-3.0 for both the first page and the following ones. On the first page, the startBound will be
// the start of the overall slice and will not be a proper Clustering. So we detect that case and just return a non-paging DataRange, which
// is what 3.0 does.
DataRange dataRange = new DataRange(keyRange, filter);
Slices slices = filter.requestedSlices();
if (!isDistinct && startBound != LegacyLayout.LegacyBound.BOTTOM && !startBound.bound.equals(slices.get(0).start()))
{
// pre-3.0 nodes normally expect pages to include the last cell from the previous page, but they handle it
// missing without any problems, so we can safely always set "inclusive" to false in the data range
dataRange = dataRange.forPaging(keyRange, metadata.comparator, startBound.getAsClustering(metadata), false);
}
return PartitionRangeReadCommand.create(true, metadata, nowInSec, selection, rowFilter, limits, dataRange);
}
public long serializedSize(ReadCommand command, int version)
{
assert version < MessagingService.VERSION_30;
assert command.kind == Kind.PARTITION_RANGE;
PartitionRangeReadCommand rangeCommand = (PartitionRangeReadCommand) command;
CFMetaData metadata = rangeCommand.metadata();
assert rangeCommand.dataRange().isPaging();
long size = TypeSizes.sizeof(metadata.ksName);
size += TypeSizes.sizeof(metadata.cfName);
size += TypeSizes.sizeof((long) rangeCommand.nowInSec());
size += AbstractBounds.rowPositionSerializer.serializedSize(rangeCommand.dataRange().keyRange(), version);
// pre-3.0 nodes only accept slice filters for paged range commands
ClusteringIndexSliceFilter filter;
if (rangeCommand.dataRange().clusteringIndexFilter.kind() == ClusteringIndexFilter.Kind.NAMES)
filter = LegacyReadCommandSerializer.convertNamesFilterToSliceFilter((ClusteringIndexNamesFilter) rangeCommand.dataRange().clusteringIndexFilter, metadata);
else
filter = (ClusteringIndexSliceFilter) rangeCommand.dataRange().clusteringIndexFilter;
// slice filter
boolean makeStaticSlice = !rangeCommand.columnFilter().fetchedColumns().statics.isEmpty() && !filter.requestedSlices().selects(Clustering.STATIC_CLUSTERING);
size += LegacyReadCommandSerializer.serializedSlicesSize(filter.requestedSlices(), makeStaticSlice, metadata);
size += TypeSizes.sizeof(filter.isReversed());
// slice filter's count
size += TypeSizes.sizeof(rangeCommand.limits().perPartitionCount());
// compositesToGroup
size += TypeSizes.sizeof(0);
// command-level Composite "start" and "stop"
DataRange.Paging pagingRange = (DataRange.Paging) rangeCommand.dataRange();
Clustering lastReturned = pagingRange.getLastReturned();
Slice lastSlice = filter.requestedSlices().get(filter.requestedSlices().size() - 1);
size += ByteBufferUtil.serializedSizeWithShortLength(LegacyLayout.encodeClustering(metadata, lastReturned));
size += ByteBufferUtil.serializedSizeWithShortLength(LegacyLayout.encodeClustering(metadata, lastSlice.end().clustering()));
size += LegacyRangeSliceCommandSerializer.serializedRowFilterSize(rangeCommand.rowFilter());
// command-level limit
size += TypeSizes.sizeof(rangeCommand.limits().count());
// countCQL3Rows
return size + TypeSizes.sizeof(true);
}
}
/**
* Serializer for pre-3.0 ReadCommands.
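* <p>
* Wire layout, as written by {@code serialize} below: message type (1 byte), isDigest
* (boolean), keyspace (UTF), partition key (short length + bytes), table (UTF),
* timestamp in millis (long), then the names- or slice-specific payload.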
*/
static class LegacyReadCommandSerializer implements IVersionedSerializer<ReadCommand>
{
public void serialize(ReadCommand command, DataOutputPlus out, int version) throws IOException
{
assert version < MessagingService.VERSION_30;
assert command.kind == Kind.SINGLE_PARTITION;
SinglePartitionReadCommand singleReadCommand = (SinglePartitionReadCommand) command;
singleReadCommand = maybeConvertNamesToSlice(singleReadCommand);
CFMetaData metadata = singleReadCommand.metadata();
out.writeByte(LegacyType.fromPartitionFilterKind(singleReadCommand.clusteringIndexFilter().kind()).serializedValue);
out.writeBoolean(singleReadCommand.isDigestQuery());
out.writeUTF(metadata.ksName);
ByteBufferUtil.writeWithShortLength(singleReadCommand.partitionKey().getKey(), out);
out.writeUTF(metadata.cfName);
out.writeLong(singleReadCommand.nowInSec() * 1000L); // convert from seconds to millis
if (singleReadCommand.clusteringIndexFilter().kind() == ClusteringIndexFilter.Kind.SLICE)
serializeSliceCommand(singleReadCommand, out);
else
serializeNamesCommand(singleReadCommand, out);
}
public ReadCommand deserialize(DataInputPlus in, int version) throws IOException
{
assert version < MessagingService.VERSION_30;
LegacyType msgType = LegacyType.fromSerializedValue(in.readByte());
boolean isDigest = in.readBoolean();
String keyspaceName = in.readUTF();
ByteBuffer key = ByteBufferUtil.readWithShortLength(in);
String cfName = in.readUTF();
long nowInMillis = in.readLong();
int nowInSeconds = (int) (nowInMillis / 1000); // convert from millis to seconds
CFMetaData metadata = Schema.instance.getCFMetaData(keyspaceName, cfName);
DecoratedKey dk = metadata.partitioner.decorateKey(key);
switch (msgType)
{
case GET_BY_NAMES:
return deserializeNamesCommand(in, isDigest, metadata, dk, nowInSeconds, version);
case GET_SLICES:
return deserializeSliceCommand(in, isDigest, metadata, dk, nowInSeconds, version);
default:
throw new AssertionError();
}
}
public long serializedSize(ReadCommand command, int version)
{
assert version < MessagingService.VERSION_30;
assert command.kind == Kind.SINGLE_PARTITION;
SinglePartitionReadCommand singleReadCommand = (SinglePartitionReadCommand) command;
singleReadCommand = maybeConvertNamesToSlice(singleReadCommand);
int keySize = singleReadCommand.partitionKey().getKey().remaining();
CFMetaData metadata = singleReadCommand.metadata();
long size = 1; // message type (single byte)
size += TypeSizes.sizeof(command.isDigestQuery());
size += TypeSizes.sizeof(metadata.ksName);
size += TypeSizes.sizeof((short) keySize) + keySize;
size += TypeSizes.sizeof((long) command.nowInSec());
if (singleReadCommand.clusteringIndexFilter().kind() == ClusteringIndexFilter.Kind.SLICE)
return size + serializedSliceCommandSize(singleReadCommand);
else
return size + serializedNamesCommandSize(singleReadCommand);
}
private void serializeNamesCommand(SinglePartitionReadCommand command, DataOutputPlus out) throws IOException
{
serializeNamesFilter(command, (ClusteringIndexNamesFilter)command.clusteringIndexFilter(), out);
}
private static void serializeNamesFilter(ReadCommand command, ClusteringIndexNamesFilter filter, DataOutputPlus out) throws IOException
{
PartitionColumns columns = command.columnFilter().fetchedColumns();
CFMetaData metadata = command.metadata();
SortedSet<Clustering> requestedRows = filter.requestedRows();
if (requestedRows.isEmpty())
{
// only static columns are requested
out.writeInt(columns.size());
for (ColumnDefinition column : columns)
ByteBufferUtil.writeWithShortLength(column.name.bytes, out);
}
else
{
out.writeInt(requestedRows.size() * columns.size());
for (Clustering clustering : requestedRows)
{
for (ColumnDefinition column : columns)
ByteBufferUtil.writeWithShortLength(LegacyLayout.encodeCellName(metadata, clustering, column.name.bytes, null), out);
}
}
// countCql3Rows should be true unless this is for Thrift or a DISTINCT query
if (command.isForThrift() || (command.limits().kind() == DataLimits.Kind.CQL_LIMIT && command.limits().perPartitionCount() == 1))
out.writeBoolean(false); // it's for Thrift or a DISTINCT query
else
out.writeBoolean(true);
}
static long serializedNamesFilterSize(ClusteringIndexNamesFilter filter, CFMetaData metadata, PartitionColumns fetchedColumns)
{
SortedSet<Clustering> requestedRows = filter.requestedRows();
long size = 0;
if (requestedRows.isEmpty())
{
// only static columns are requested
size += TypeSizes.sizeof(fetchedColumns.size());
for (ColumnDefinition column : fetchedColumns)
size += ByteBufferUtil.serializedSizeWithShortLength(column.name.bytes);
}
else
{
size += TypeSizes.sizeof(requestedRows.size() * fetchedColumns.size());
for (Clustering clustering : requestedRows)
{
for (ColumnDefinition column : fetchedColumns)
size += ByteBufferUtil.serializedSizeWithShortLength(LegacyLayout.encodeCellName(metadata, clustering, column.name.bytes, null));
}
}
return size + TypeSizes.sizeof(true); // countCql3Rows
}
private SinglePartitionReadCommand deserializeNamesCommand(DataInputPlus in, boolean isDigest, CFMetaData metadata, DecoratedKey key, int nowInSeconds, int version) throws IOException
{
Pair<ColumnFilter, ClusteringIndexNamesFilter> selectionAndFilter = deserializeNamesSelectionAndFilter(in, metadata);
return SinglePartitionReadCommand.legacyNamesCommand(isDigest, version, metadata, nowInSeconds, selectionAndFilter.left, key, selectionAndFilter.right);
}
static Pair<ColumnFilter, ClusteringIndexNamesFilter> deserializeNamesSelectionAndFilter(DataInputPlus in, CFMetaData metadata) throws IOException
{
int numCellNames = in.readInt();
// The names filter could include either a) static columns or b) normal columns with the clustering columns
// fully specified. We need to handle those cases differently in 3.0.
NavigableSet<Clustering> clusterings = new TreeSet<>(metadata.comparator);
ColumnFilter.Builder selectionBuilder = ColumnFilter.selectionBuilder();
for (int i = 0; i < numCellNames; i++)
{
ByteBuffer buffer = ByteBufferUtil.readWithShortLength(in);
LegacyLayout.LegacyCellName cellName;
try
{
cellName = LegacyLayout.decodeCellName(metadata, buffer);
}
catch (UnknownColumnException exc)
{
// TODO this probably needs a new exception class that shares a parent with UnknownColumnFamilyException
throw new UnknownColumnFamilyException(
"Received legacy range read command with names filter for unrecognized column name. " +
"Fill name in filter (hex): " + ByteBufferUtil.bytesToHex(buffer), metadata.cfId);
}
// If we're querying for a static column, we may also need to read it
// as if it were a thrift dynamic column (because the column metadata,
// which makes it a static column in 3.0+, may have been added *after*
// some values were written). Note that all cql queries on non-compact
// tables used slice & not name filters prior to 3.0 so this path is
// not taken for non-compact tables. It is theoretically possible to
// get here via thrift, hence the check on metadata.isStaticCompactTable.
// See CASSANDRA-11087.
if (metadata.isStaticCompactTable() && cellName.clustering.equals(Clustering.STATIC_CLUSTERING))
{
clusterings.add(Clustering.make(cellName.column.name.bytes));
selectionBuilder.add(metadata.compactValueColumn());
}
else
{
clusterings.add(cellName.clustering);
}
selectionBuilder.add(cellName.column);
}
// for compact storage tables without clustering keys, the column holding the selected value is named
// 'value' internally; we add it to the selection here to prevent errors due to unexpected column names
// when serializing the initial local data response
if (metadata.isStaticCompactTable() && clusterings.isEmpty())
selectionBuilder.addAll(metadata.partitionColumns());
in.readBoolean(); // countCql3Rows
// clusterings cannot include STATIC_CLUSTERING, so if the names filter is for static columns, clusterings
// will be empty. However, by requesting the static columns in our ColumnFilter, this will still work.
ClusteringIndexNamesFilter filter = new ClusteringIndexNamesFilter(clusterings, false);
return Pair.create(selectionBuilder.build(), filter);
}
private long serializedNamesCommandSize(SinglePartitionReadCommand command)
{
ClusteringIndexNamesFilter filter = (ClusteringIndexNamesFilter)command.clusteringIndexFilter();
PartitionColumns columns = command.columnFilter().fetchedColumns();
return serializedNamesFilterSize(filter, command.metadata(), columns);
}
private void serializeSliceCommand(SinglePartitionReadCommand command, DataOutputPlus out) throws IOException
{
CFMetaData metadata = command.metadata();
ClusteringIndexSliceFilter filter = (ClusteringIndexSliceFilter)command.clusteringIndexFilter();
Slices slices = filter.requestedSlices();
boolean makeStaticSlice = !command.columnFilter().fetchedColumns().statics.isEmpty() && !slices.selects(Clustering.STATIC_CLUSTERING);
serializeSlices(out, slices, filter.isReversed(), makeStaticSlice, metadata);
out.writeBoolean(filter.isReversed());
boolean selectsStatics = !command.columnFilter().fetchedColumns().statics.isEmpty() || slices.selects(Clustering.STATIC_CLUSTERING);
DataLimits limits = command.limits();
if (limits.isDistinct())
out.writeInt(1); // the limit is always 1 for DISTINCT queries
else
out.writeInt(updateLimitForQuery(command.limits().count(), filter.requestedSlices()));
int compositesToGroup;
if (limits.kind() == DataLimits.Kind.THRIFT_LIMIT || metadata.isDense())
compositesToGroup = -1;
else if (limits.isDistinct() && !selectsStatics)
compositesToGroup = -2; // for DISTINCT queries (CASSANDRA-8490)
else
compositesToGroup = metadata.clusteringColumns().size();
out.writeInt(compositesToGroup);
}
private SinglePartitionReadCommand deserializeSliceCommand(DataInputPlus in, boolean isDigest, CFMetaData metadata, DecoratedKey key, int nowInSeconds, int version) throws IOException
{
Pair<ClusteringIndexSliceFilter, Boolean> p = deserializeSlicePartitionFilter(in, metadata);
ClusteringIndexSliceFilter filter = p.left;
boolean selectsStatics = p.right;
int count = in.readInt();
int compositesToGroup = in.readInt();
// if a slice query from a pre-3.0 node doesn't cover statics, we shouldn't select them at all
ColumnFilter columnFilter = LegacyRangeSliceCommandSerializer.getColumnSelectionForSlice(selectsStatics, compositesToGroup, metadata);
// We have 2 types of DISTINCT queries: ones on only the partition key, and ones on the partition key and static columns. For the former,
// we can easily detect the case because compositesToGroup is -2 and that's the only case it can be that. The latter is problematic,
// however, as we have no way to distinguish it from a normal select with a limit of 1 (contrary to the range query case,
// where the countCQL3Rows boolean allows us to decide).
// So we consider this case not distinct here. This is ok because even if it is a distinct (with statics), the count will be 1 and
// we'll still just query one row (a distinct DataLimits currently behaves exactly like a CQL limit with a count of 1). The only
// drawback is that we'll send back the first row entirely while a 2.1/2.2 node would return only the first cell in that same
// situation. This isn't a problem for 2.1/2.2 code however (it would be for a range query, as it would throw off the count for
// reasons similar to CASSANDRA-10762, but it's ok for single partition queries).
// We do _not_ want to do the reverse however and consider a 'SELECT * FROM foo LIMIT 1' as a DISTINCT query, as that would make
// us only return the 1st cell rather than the 1st row.
DataLimits limits;
if (compositesToGroup == -2)
limits = DataLimits.distinctLimits(count); // See CASSANDRA-8490 for the explanation of this value
else if (compositesToGroup == -1)
limits = DataLimits.thriftLimits(1, count);
else
limits = DataLimits.cqlLimits(count);
return SinglePartitionReadCommand.legacySliceCommand(isDigest, version, metadata, nowInSeconds, columnFilter, limits, key, filter);
}
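// Illustrative sketch, not part of the original class: the compositesToGroup sentinel read above
// encodes three cases. This hypothetical helper just restates the decoding rule applied by
// deserializeSliceCommand() for readability.
private static String exampleLimitKind(int compositesToGroup)
{
if (compositesToGroup == -2)
return "DISTINCT"; // partition-key-only DISTINCT query (CASSANDRA-8490)
if (compositesToGroup == -1)
return "THRIFT"; // dense/Thrift table: the limit counts cells rather than CQL rows
return "CQL"; // >= 0: the number of clustering components to group; plain CQL row limit
}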
private long serializedSliceCommandSize(SinglePartitionReadCommand command)
{
CFMetaData metadata = command.metadata();
ClusteringIndexSliceFilter filter = (ClusteringIndexSliceFilter)command.clusteringIndexFilter();
Slices slices = filter.requestedSlices();
boolean makeStaticSlice = !command.columnFilter().fetchedColumns().statics.isEmpty() && !slices.selects(Clustering.STATIC_CLUSTERING);
long size = serializedSlicesSize(slices, makeStaticSlice, metadata);
size += TypeSizes.sizeof(command.clusteringIndexFilter().isReversed());
size += TypeSizes.sizeof(command.limits().count());
return size + TypeSizes.sizeof(0); // compositesToGroup
}
static void serializeSlices(DataOutputPlus out, Slices slices, boolean isReversed, boolean makeStaticSlice, CFMetaData metadata) throws IOException
{
out.writeInt(slices.size() + (makeStaticSlice ? 1 : 0));
// In 3.0 we always store the slices in normal comparator order. Pre-3.0 nodes expect the slices to
// be in reversed order if the query is reversed, so we handle that here.
if (isReversed)
{
for (int i = slices.size() - 1; i >= 0; i--)
serializeSlice(out, slices.get(i), true, metadata);
if (makeStaticSlice)
serializeStaticSlice(out, true, metadata);
}
else
{
if (makeStaticSlice)
serializeStaticSlice(out, false, metadata);
for (Slice slice : slices)
serializeSlice(out, slice, false, metadata);
}
}
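// Illustrative sketch, not part of the original class: the wire-ordering rule used above, reduced to
// plain strings instead of Slice objects (the real method also interleaves the synthetic static slice).
private static List<String> exampleWireOrder(List<String> slicesInComparatorOrder, boolean isReversed)
{
List<String> wireOrder = new ArrayList<>(slicesInComparatorOrder);
if (isReversed)
Collections.reverse(wireOrder); // pre-3.0 nodes expect reversed queries to list their slices in reversed order
return wireOrder;
}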
static long serializedSlicesSize(Slices slices, boolean makeStaticSlice, CFMetaData metadata)
{
long size = TypeSizes.sizeof(slices.size());
for (Slice slice : slices)
{
ByteBuffer sliceStart = LegacyLayout.encodeBound(metadata, slice.start(), true);
size += ByteBufferUtil.serializedSizeWithShortLength(sliceStart);
ByteBuffer sliceEnd = LegacyLayout.encodeBound(metadata, slice.end(), false);
size += ByteBufferUtil.serializedSizeWithShortLength(sliceEnd);
}
if (makeStaticSlice)
size += serializedStaticSliceSize(metadata);
return size;
}
static long serializedStaticSliceSize(CFMetaData metadata)
{
// this mirrors serializeStaticSlice(), except that we don't care about reversal for size calculations
ByteBuffer sliceStart = LegacyLayout.encodeBound(metadata, ClusteringBound.BOTTOM, false);
long size = ByteBufferUtil.serializedSizeWithShortLength(sliceStart);
size += TypeSizes.sizeof((short) (metadata.comparator.size() * 3 + 2));
size += TypeSizes.sizeof((short) LegacyLayout.STATIC_PREFIX);
for (int i = 0; i < metadata.comparator.size(); i++)
{
size += ByteBufferUtil.serializedSizeWithShortLength(ByteBufferUtil.EMPTY_BYTE_BUFFER);
size += 1; // EOC
}
return size;
}
private static void serializeSlice(DataOutputPlus out, Slice slice, boolean isReversed, CFMetaData metadata) throws IOException
{
ByteBuffer sliceStart = LegacyLayout.encodeBound(metadata, isReversed ? slice.end() : slice.start(), !isReversed);
ByteBufferUtil.writeWithShortLength(sliceStart, out);
ByteBuffer sliceEnd = LegacyLayout.encodeBound(metadata, isReversed ? slice.start() : slice.end(), isReversed);
ByteBufferUtil.writeWithShortLength(sliceEnd, out);
}
private static void serializeStaticSlice(DataOutputPlus out, boolean isReversed, CFMetaData metadata) throws IOException
{
// if not reversed, write an empty bound for the slice start up front; if reversed, write out an empty bound for the
// slice finish after we've written the static slice start
if (!isReversed)
{
ByteBuffer sliceStart = LegacyLayout.encodeBound(metadata, ClusteringBound.BOTTOM, false);
ByteBufferUtil.writeWithShortLength(sliceStart, out);
}
// write out the length of the composite
out.writeShort(2 + metadata.comparator.size() * 3); // 3 bytes per component (two-byte length + EOC byte), plus the two-byte static prefix
out.writeShort(LegacyLayout.STATIC_PREFIX);
for (int i = 0; i < metadata.comparator.size(); i++)
{
ByteBufferUtil.writeWithShortLength(ByteBufferUtil.EMPTY_BYTE_BUFFER, out);
// write the EOC, using an inclusive end if we're on the final component
out.writeByte(i == metadata.comparator.size() - 1 ? 1 : 0);
}
if (isReversed)
{
ByteBuffer sliceStart = LegacyLayout.encodeBound(metadata, ClusteringBound.BOTTOM, false);
ByteBufferUtil.writeWithShortLength(sliceStart, out);
}
}
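// Illustrative sketch, not part of the original class: the exact bytes serializeStaticSlice() emits for
// the static composite, using only JDK streams. Assumes LegacyLayout.STATIC_PREFIX is the 0xFFFF static
// marker used by the pre-3.0 composite encoding.
private static byte[] exampleStaticComposite(int clusteringCount) throws IOException
{
java.io.ByteArrayOutputStream bytes = new java.io.ByteArrayOutputStream();
java.io.DataOutputStream out = new java.io.DataOutputStream(bytes);
out.writeShort(2 + clusteringCount * 3); // composite length: static prefix plus 3 bytes per component
out.writeShort(0xFFFF); // static prefix marker
for (int i = 0; i < clusteringCount; i++)
{
out.writeShort(0); // empty component value, written with a short length of 0
out.writeByte(i == clusteringCount - 1 ? 1 : 0); // EOC: inclusive end on the final component
}
return bytes.toByteArray();
}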
// Returns the deserialized filter, and whether static columns are queried (pre-3.0, both pieces of information are determined
// by the slices, but in 3.0 they are separate: whether static columns are queried or not depends on the ColumnFilter).
static Pair<ClusteringIndexSliceFilter, Boolean> deserializeSlicePartitionFilter(DataInputPlus in, CFMetaData metadata) throws IOException
{
int numSlices = in.readInt();
ByteBuffer[] startBuffers = new ByteBuffer[numSlices];
ByteBuffer[] finishBuffers = new ByteBuffer[numSlices];
for (int i = 0; i < numSlices; i++)
{
startBuffers[i] = ByteBufferUtil.readWithShortLength(in);
finishBuffers[i] = ByteBufferUtil.readWithShortLength(in);
}
boolean reversed = in.readBoolean();
if (reversed)
{
// pre-3.0, reversed query slices put the greater element at the start of the slice
ByteBuffer[] tmp = finishBuffers;
finishBuffers = startBuffers;
startBuffers = tmp;
}
boolean selectsStatics = false;
Slices.Builder slicesBuilder = new Slices.Builder(metadata.comparator);
for (int i = 0; i < numSlices; i++)
{
LegacyLayout.LegacyBound start = LegacyLayout.decodeSliceBound(metadata, startBuffers[i], true);
LegacyLayout.LegacyBound finish = LegacyLayout.decodeSliceBound(metadata, finishBuffers[i], false);
if (start.isStatic)
{
// If the (inclusive) start covers the static block, record that statics are selected. This has to be
// checked before 'start' is replaced below, since BOTTOM's bound is always inclusive.
if (start.bound.isInclusive())
selectsStatics = true;
// If we start at the static block, this means we start at the beginning of the partition in 3.0
// terms (since 3.0 handles statics outside of the slice).
start = LegacyLayout.LegacyBound.BOTTOM;
}
else if (start == LegacyLayout.LegacyBound.BOTTOM)
{
selectsStatics = true;
}
// If the end of the slice is the end of the statics, then that means this slice was just selecting static
// columns. We have already recorded that in selectsStatics, so we can ignore the slice (which doesn't make
// sense for 3.0).
if (finish.isStatic)
{
assert finish.bound.isInclusive(); // it would make no sense for a pre-3.0 node to have a slice that stops
// before the static columns (since there is nothing before that)
continue;
}
slicesBuilder.add(Slice.make(start.bound, finish.bound));
}
return Pair.create(new ClusteringIndexSliceFilter(slicesBuilder.build(), reversed), selectsStatics);
}
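// Illustrative sketch, not part of the original class: the start/finish swap above, restated per slice.
// Pre-3.0 reversed queries place the greater bound first, while 3.0 Slices always keep the lesser bound
// as the start, so swapping each pair yields the 3.0 ordering.
private static void exampleNormalizeReversedBounds(ByteBuffer[] starts, ByteBuffer[] finishes)
{
for (int i = 0; i < starts.length; i++)
{
ByteBuffer tmp = starts[i];
starts[i] = finishes[i]; // the pre-3.0 'finish' was the lesser bound, so it becomes the 3.0 start
finishes[i] = tmp; // and the pre-3.0 'start' (the greater bound) becomes the 3.0 finish
}
}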
private static SinglePartitionReadCommand maybeConvertNamesToSlice(SinglePartitionReadCommand command)
{
if (command.clusteringIndexFilter().kind() != ClusteringIndexFilter.Kind.NAMES)
return command;
CFMetaData metadata = command.metadata();
if (!shouldConvertNamesToSlice(metadata, command.columnFilter().fetchedColumns()))
return command;
ClusteringIndexNamesFilter filter = (ClusteringIndexNamesFilter)command.clusteringIndexFilter();
ClusteringIndexSliceFilter sliceFilter = convertNamesFilterToSliceFilter(filter, metadata);
return command.withUpdatedClusteringIndexFilter(sliceFilter);
}
/**
* Returns true if a names filter on the given table and column selection should be converted to a slice
* filter for compatibility with pre-3.0 nodes, false otherwise.
*/
static boolean shouldConvertNamesToSlice(CFMetaData metadata, PartitionColumns columns)
{
// On pre-3.0 nodes, due to CASSANDRA-5762, we always do a slice for CQL3 tables (i.e. non-dense, compound tables).
if (!metadata.isDense() && metadata.isCompound())
return true;
// pre-3.0 nodes don't support names filters for reading collections, so if we're requesting any of those,
// we need to convert this to a slice filter
for (ColumnDefinition column : columns)
{
if (column.type.isMultiCell())
return true;
}
return false;
}
/**
* Converts a names filter that is incompatible with pre-3.0 nodes to a slice filter that is compatible.
*/
private static ClusteringIndexSliceFilter convertNamesFilterToSliceFilter(ClusteringIndexNamesFilter filter, CFMetaData metadata)
{
SortedSet<Clustering> requestedRows = filter.requestedRows();
Slices slices;
if (requestedRows.isEmpty())
{
slices = Slices.NONE;
}
else if (requestedRows.size() == 1 && requestedRows.first().size() == 0)
{
slices = Slices.ALL;
}
else
{
Slices.Builder slicesBuilder = new Slices.Builder(metadata.comparator);
for (Clustering clustering : requestedRows)
slicesBuilder.add(ClusteringBound.inclusiveStartOf(clustering), ClusteringBound.inclusiveEndOf(clustering));
slices = slicesBuilder.build();
}
return new ClusteringIndexSliceFilter(slices, filter.isReversed());
}
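// Illustrative sketch, not part of the original class: the names-to-slices conversion above, reduced to
// sorted strings. Each requested row becomes a degenerate inclusive [row, row] slice.
private static List<String[]> exampleNamesAsSlices(SortedSet<String> requestedRows)
{
List<String[]> slices = new ArrayList<>();
for (String row : requestedRows)
slices.add(new String[]{ row, row }); // inclusive start == inclusive end selects exactly that row
return slices;
}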
/**
* Potentially increases the existing query limit to account for the lack of exclusive bounds in pre-3.0 nodes.
* @param limit the existing query limit
* @param slices the requested slices
* @return the updated limit
*/
static int updateLimitForQuery(int limit, Slices slices)
{
// Pre-3.0 nodes don't support exclusive bounds for slices. Instead, we query one more element if necessary
// and filter it later (in LegacyRemoteDataResponse)
if (!slices.hasLowerBound() && !slices.hasUpperBound())
return limit;
for (Slice slice : slices)
{
if (limit == Integer.MAX_VALUE)
return limit;
if (!slice.start().isInclusive())
limit++;
if (!slice.end().isInclusive())
limit++;
}
return limit;
}
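// Illustrative sketch, not part of the original class: a worked example of the limit bump above. With a
// limit of 10 and two slices, one with an exclusive start and one with an exclusive end, we over-fetch
// to 12 rows and rely on later filtering (see LegacyRemoteDataResponse) to drop the excluded bounds.
private static int exampleUpdatedLimit()
{
int limit = 10;
boolean[][] inclusiveStartEnd = { { false, true }, { true, false } }; // (start, end) inclusiveness per slice
for (boolean[] slice : inclusiveStartEnd)
{
if (!slice[0])
limit++; // one extra row per exclusive start
if (!slice[1])
limit++; // one extra row per exclusive end
}
return limit; // 12
}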
}
}