blob: 8d3f4f314b716a0c423885e088a54121c604d71f [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.db.columniterator;
import java.io.IOException;
import java.util.*;
import com.google.common.base.Verify;
import org.apache.cassandra.config.CFMetaData;
import org.apache.cassandra.db.*;
import org.apache.cassandra.db.filter.ColumnFilter;
import org.apache.cassandra.db.partitions.ImmutableBTreePartition;
import org.apache.cassandra.db.rows.*;
import org.apache.cassandra.io.sstable.format.SSTableReader;
import org.apache.cassandra.io.sstable.format.SSTableReadsListener;
import org.apache.cassandra.io.util.FileDataInput;
import org.apache.cassandra.utils.AbstractIterator;
import org.apache.cassandra.utils.btree.BTree;
/**
* A Cell Iterator in reversed clustering order over SSTable
*/
public class SSTableReversedIterator extends AbstractSSTableIterator
{
public SSTableReversedIterator(SSTableReader sstable,
DecoratedKey key,
ColumnFilter columns,
boolean isForThrift,
SSTableReadsListener listener)
{
this(sstable, null, key, sstable.getPosition(key, SSTableReader.Operator.EQ, listener), columns, isForThrift);
}
public SSTableReversedIterator(SSTableReader sstable,
FileDataInput file,
DecoratedKey key,
RowIndexEntry indexEntry,
ColumnFilter columns,
boolean isForThrift)
{
super(sstable, file, key, indexEntry, columns, isForThrift);
}
protected Reader createReader(RowIndexEntry indexEntry, FileDataInput file, boolean shouldCloseFile)
{
return indexEntry.isIndexed()
? new ReverseIndexedReader(indexEntry, file, shouldCloseFile)
: new ReverseReader(file, shouldCloseFile);
}
public boolean isReverseOrder()
{
return true;
}
private class ReverseReader extends Reader
{
protected ReusablePartitionData buffer;
protected Iterator<Unfiltered> iterator;
// Set in loadFromDisk () and used in setIterator to handle range tombstone extending on multiple index block. See
// loadFromDisk for details. Note that those are always false for non-indexed readers.
protected boolean skipFirstIteratedItem;
protected boolean skipLastIteratedItem;
protected Unfiltered mostRecentlyEmitted = null;
private ReverseReader(FileDataInput file, boolean shouldCloseFile)
{
super(file, shouldCloseFile);
}
protected ReusablePartitionData createBuffer(int blocksCount)
{
int estimatedRowCount = 16;
int columnCount = metadata().partitionColumns().regulars.size();
if (columnCount == 0 || metadata().clusteringColumns().isEmpty())
{
estimatedRowCount = 1;
}
else
{
try
{
// To avoid wasted resizing we guess-estimate the number of rows we're likely to read. For that
// we use the stats on the number of rows per partition for that sstable.
// FIXME: so far we only keep stats on cells, so to get a rough estimate on the number of rows,
// we divide by the number of regular columns the table has. We should fix once we collect the
// stats on rows
int estimatedRowsPerPartition = (int)(sstable.getEstimatedColumnCount().percentile(0.75) / columnCount);
estimatedRowCount = Math.max(estimatedRowsPerPartition / blocksCount, 1);
}
catch (IllegalStateException e)
{
// The EstimatedHistogram mean() method can throw this (if it overflows). While such overflow
// shouldn't happen, it's not worth taking the risk of letting the exception bubble up.
}
}
return new ReusablePartitionData(metadata(), partitionKey(), columns(), estimatedRowCount);
}
public void setForSlice(Slice slice) throws IOException
{
// If we have read the data, just create the iterator for the slice. Otherwise, read the data.
if (buffer == null)
{
buffer = createBuffer(1);
// Note that we can reuse that buffer between slices (we could alternatively re-read from disk
// every time, but that feels more wasteful) so we want to include everything from the beginning.
// We can stop at the slice end however since any following slice will be before that.
loadFromDisk(null, slice.end(), false, false, null, null);
}
setIterator(slice);
}
protected void setIterator(Slice slice)
{
assert buffer != null;
iterator = buffer.built.unfilteredIterator(columns, Slices.with(metadata().comparator, slice), true);
if (!iterator.hasNext())
return;
if (skipFirstIteratedItem)
iterator.next();
if (skipLastIteratedItem)
iterator = new SkipLastIterator(iterator);
}
protected boolean hasNextInternal() throws IOException
{
// If we've never called setForSlice, we're reading everything
if (iterator == null)
setForSlice(Slice.ALL);
return iterator.hasNext();
}
protected Unfiltered nextInternal() throws IOException
{
if (!hasNext())
throw new NoSuchElementException();
Unfiltered next = iterator.next();
mostRecentlyEmitted = next;
return next;
}
protected boolean stopReadingDisk()
{
return false;
}
// checks if left prefix precedes right prefix
private boolean precedes(ClusteringPrefix left, ClusteringPrefix right)
{
return metadata().comparator.compare(left, right) < 0;
}
// Reads the unfiltered from disk and load them into the reader buffer. It stops reading when either the partition
// is fully read, or when stopReadingDisk() returns true.
protected void loadFromDisk(Slice.Bound start,
Slice.Bound end,
boolean hasPreviousBlock,
boolean hasNextBlock,
ClusteringPrefix currentFirstName,
ClusteringPrefix nextLastName) throws IOException
{
// start != null means it's the block covering the beginning of the slice, so it has to be the last block for this slice.
assert start == null || !hasNextBlock;
buffer.reset();
skipFirstIteratedItem = false;
skipLastIteratedItem = false;
boolean isFirst = true;
// If the start might be in this block, skip everything that comes before it.
if (start != null)
{
while (deserializer.hasNext() && deserializer.compareNextTo(start) <= 0 && !stopReadingDisk())
{
isFirst = false;
if (deserializer.nextIsRow())
deserializer.skipNext();
else
updateOpenMarker((RangeTombstoneMarker)deserializer.readNext());
}
}
// If we have an open marker, it's either one from what we just skipped or it's one that open in the next (or
// one of the next) index block (if openMarker == openMarkerAtStartOfBlock).
if (openMarker != null)
{
// We have to feed a marker to the buffer, because that marker is likely to be close later and ImmtableBTreePartition
// doesn't take kindly to marker that comes without their counterpart. If that's the last block we're gonna read (for
// the current slice at least) it's easy because we'll want to return that open marker at the end of the data in this
// block anyway, so we have nothing more to do than adding it to the buffer.
// If it's not the last block however, in which case we know we'll have start == null, it means this marker is really
// open in a next block and so while we do need to add it the buffer for the reason mentioned above, we don't
// want to "return" it just yet, we'll wait until we reach it in the next blocks. That's why we trigger
// skipLastIteratedItem in that case (this is first item of the block, but we're iterating in reverse order
// so it will be last returned by the iterator).
RangeTombstone.Bound markerStart = start == null ? RangeTombstone.Bound.BOTTOM : RangeTombstone.Bound.fromSliceBound(start);
buffer.add(new RangeTombstoneBoundMarker(markerStart, openMarker));
if (hasNextBlock)
skipLastIteratedItem = true;
}
// Now deserialize everything until we reach our requested end (if we have one)
// See SSTableIterator.ForwardRead.computeNext() for why this is a strict inequality below: this is the same
// reasoning here.
while (deserializer.hasNext()
&& (end == null || deserializer.compareNextTo(end) < 0)
&& !stopReadingDisk())
{
Unfiltered unfiltered = deserializer.readNext();
if (isFirst && openMarker == null
&& currentFirstName != null && nextLastName != null
&& (precedes(currentFirstName, nextLastName) || precedes(unfiltered.clustering(), currentFirstName)))
{
// Range tombstones spanning multiple index blocks when reading legacy sstables need special handling.
// Pre-3.0, the column index didn't encode open markers. Instead, open range tombstones were rewritten
// at the start of index blocks they at least partially covered. These rewritten RTs found at the
// beginning of index blocks need to be handled as though they were an open marker, otherwise iterator
// validation will fail and/or some rows will be excluded from the result. These rewritten RTs can be
// detected based on their relation to the current index block and the next one depending on what wrote
// the sstable. For sstables coming from a memtable flush, a rewritten RT will have a clustering value
// less than the first name of its index block. For sstables coming from compaction, the index block
// first name will be the RT open bound, which will be less than the last name of the next block. So,
// here we compare the first name of this block to the last name of the next block to detect the
// compaction case, and clustering value of the unfiltered we just read to the index block's first name
// to detect the flush case.
Verify.verify(!sstable.descriptor.version.storeRows());
Verify.verify(openMarker == null);
Verify.verify(!skipLastIteratedItem);
Verify.verify(unfiltered.isRangeTombstoneMarker());
buffer.add(unfiltered);
if (hasNextBlock)
skipLastIteratedItem = true;
}
else if (isFirst && nextLastName != null && !precedes(nextLastName, unfiltered.clustering()))
{
// When dealing with old format sstable, we have the problem that a row can span 2 index block, i.e. it can
// start at the end of a block and end at the beginning of the next one. That's not a problem per se for
// UnfilteredDeserializer.OldFormatSerializer, since it always read rows entirely, even if they span index
// blocks, but as we reading index block in reverse we must be careful to not read the end of the row at
// beginning of a block before we're reading the beginning of that row. So what we do is that if we detect
// that the row starting this block is also the row ending the next one we're read (previous on disk), then
// we'll skip that first result and let it be read with the next block.
Verify.verify(!sstable.descriptor.version.storeRows());
isFirst = false;
}
else if (unfiltered.isEmpty())
{
isFirst = false;
}
else
{
buffer.add(unfiltered);
isFirst = false;
}
if (unfiltered.isRangeTombstoneMarker())
updateOpenMarker((RangeTombstoneMarker)unfiltered);
}
if (!sstable.descriptor.version.storeRows()
&& deserializer.hasNext()
&& (end == null || deserializer.compareNextTo(end) < 0))
{
// Range tombstone start and end bounds are stored together in legacy sstables. When we read one, we
// stash the closing bound until we reach the appropriate place to emit it, which is immediately before
// the next unfiltered with a greater clustering.
// If SSTRI considers the block exhausted before encountering such a clustering though, this end marker
// will never be emitted. So here we just check if there's a closing bound left in the deserializer.
// If there is, we compare it against the most recently emitted unfiltered (i.e.: the last unfiltered
// that this RT would enclose. And we have to do THAT comparison because the last name field on the
// current index block will be whatever was written at the end of the index block (i.e. the last name
// physically in the block), not the closing bound of the range tombstone (i.e. the last name logically
// in the block). If all this indicates that there is indeed a range tombstone we're missing, we add it
// to the buffer and update the open marker field.
Unfiltered unfiltered = deserializer.readNext();
RangeTombstoneMarker marker = unfiltered.isRangeTombstoneMarker() ? (RangeTombstoneMarker) unfiltered : null;
if (marker != null && marker.isClose(false)
&& (mostRecentlyEmitted == null || precedes(marker.clustering(), mostRecentlyEmitted.clustering())))
{
buffer.add(marker);
updateOpenMarker(marker);
}
}
// If we have an open marker, we should close it before finishing
if (openMarker != null)
{
// This is the reverse problem than the one at the start of the block. Namely, if it's the first block
// we deserialize for the slice (the one covering the slice end basically), then it's easy, we just want
// to add the close marker to the buffer and return it normally.
// If it's note our first block (for the slice) however, it means that marker closed in a previously read
// block and we have already returned it. So while we should still add it to the buffer for the sake of
// not breaking ImmutableBTreePartition, we should skip it when returning from the iterator, hence the
// skipFirstIteratedItem (this is the last item of the block, but we're iterating in reverse order so it will
// be the first returned by the iterator).
RangeTombstone.Bound markerEnd = end == null ? RangeTombstone.Bound.TOP : RangeTombstone.Bound.fromSliceBound(end);
buffer.add(new RangeTombstoneBoundMarker(markerEnd, openMarker));
if (hasPreviousBlock)
skipFirstIteratedItem = true;
}
buffer.build();
}
}
private class ReverseIndexedReader extends ReverseReader
{
private final IndexState indexState;
// The slice we're currently iterating over
private Slice slice;
// The last index block to consider for the slice
private int lastBlockIdx;
private ReverseIndexedReader(RowIndexEntry indexEntry, FileDataInput file, boolean shouldCloseFile)
{
super(file, shouldCloseFile);
this.indexState = new IndexState(this, sstable.metadata.comparator, indexEntry, true);
}
@Override
public void setForSlice(Slice slice) throws IOException
{
this.slice = slice;
// if our previous slicing already got us past the beginning of the sstable, we're done
if (indexState.isDone())
{
iterator = Collections.emptyIterator();
return;
}
// Find the first index block we'll need to read for the slice.
int startIdx = indexState.findBlockIndex(slice.end(), indexState.currentBlockIdx());
if (startIdx < 0)
{
iterator = Collections.emptyIterator();
indexState.setToBlock(startIdx);
return;
}
lastBlockIdx = indexState.findBlockIndex(slice.start(), startIdx);
// If the last block to look (in reverse order) is after the very last block, we have nothing for that slice
if (lastBlockIdx >= indexState.blocksCount())
{
assert startIdx >= indexState.blocksCount();
iterator = Collections.emptyIterator();
return;
}
// If we start (in reverse order) after the very last block, just read from the last one.
if (startIdx >= indexState.blocksCount())
startIdx = indexState.blocksCount() - 1;
// Note that even if we were already set on the proper block (which would happen if the previous slice
// requested ended on the same block this one start), we can't reuse it because when reading the previous
// slice we've only read that block from the previous slice start. Re-reading also handles
// skipFirstIteratedItem/skipLastIteratedItem that we would need to handle otherwise.
indexState.setToBlock(startIdx);
readCurrentBlock(false, startIdx != lastBlockIdx);
}
@Override
protected boolean hasNextInternal() throws IOException
{
if (super.hasNextInternal())
return true;
while (true)
{
// We have nothing more for our current block, move the next one (so the one before on disk).
int nextBlockIdx = indexState.currentBlockIdx() - 1;
if (nextBlockIdx < 0 || nextBlockIdx < lastBlockIdx)
return false;
// The slice start can be in
indexState.setToBlock(nextBlockIdx);
readCurrentBlock(true, nextBlockIdx != lastBlockIdx);
// If an indexed block only contains data for a dropped column, the iterator will be empty, even
// though we may still have data to read in subsequent blocks
// also, for pre-3.0 storage formats, index blocks that only contain a single row and that row crosses
// index boundaries, the iterator will be empty even though we haven't read everything we're intending
// to read. In that case, we want to read the next index block. This shouldn't be possible in 3.0+
// formats (see next comment)
if (!iterator.hasNext() && nextBlockIdx > lastBlockIdx)
{
continue;
}
return iterator.hasNext();
}
}
/**
* Reads the current block, the last one we've set.
*
* @param hasPreviousBlock is whether we have already read a previous block for the current slice.
* @param hasNextBlock is whether we have more blocks to read for the current slice.
*/
private void readCurrentBlock(boolean hasPreviousBlock, boolean hasNextBlock) throws IOException
{
if (buffer == null)
buffer = createBuffer(indexState.blocksCount());
int currentBlock = indexState.currentBlockIdx();
// The slice start (resp. slice end) is only meaningful on the last (resp. first) block read (since again,
// we read blocks in reverse order).
boolean canIncludeSliceStart = !hasNextBlock;
boolean canIncludeSliceEnd = !hasPreviousBlock;
ClusteringPrefix currentFirstName = null;
ClusteringPrefix nextLastName = null;
if (!sstable.descriptor.version.storeRows() && currentBlock > 0)
{
currentFirstName = indexState.index(currentBlock).firstName;
nextLastName = indexState.index(currentBlock - 1).lastName;
}
loadFromDisk(canIncludeSliceStart ? slice.start() : null,
canIncludeSliceEnd ? slice.end() : null,
hasPreviousBlock,
hasNextBlock,
currentFirstName,
nextLastName
);
setIterator(slice);
}
@Override
protected boolean stopReadingDisk()
{
return indexState.isPastCurrentBlock();
}
}
private class ReusablePartitionData
{
private final CFMetaData metadata;
private final DecoratedKey partitionKey;
private final PartitionColumns columns;
private MutableDeletionInfo.Builder deletionBuilder;
private MutableDeletionInfo deletionInfo;
private BTree.Builder<Row> rowBuilder;
private ImmutableBTreePartition built;
private ReusablePartitionData(CFMetaData metadata,
DecoratedKey partitionKey,
PartitionColumns columns,
int initialRowCapacity)
{
this.metadata = metadata;
this.partitionKey = partitionKey;
this.columns = columns;
this.rowBuilder = BTree.builder(metadata.comparator, initialRowCapacity);
}
public void add(Unfiltered unfiltered)
{
if (unfiltered.isRow())
rowBuilder.add((Row)unfiltered);
else
deletionBuilder.add((RangeTombstoneMarker)unfiltered);
}
public void reset()
{
built = null;
rowBuilder.reuse();
deletionBuilder = MutableDeletionInfo.builder(partitionLevelDeletion, metadata().comparator, false);
}
public void build()
{
deletionInfo = deletionBuilder.build();
built = new ImmutableBTreePartition(metadata, partitionKey, columns, Rows.EMPTY_STATIC_ROW, rowBuilder.build(),
deletionInfo, EncodingStats.NO_STATS);
deletionBuilder = null;
}
}
private static class SkipLastIterator extends AbstractIterator<Unfiltered>
{
private final Iterator<Unfiltered> iterator;
private SkipLastIterator(Iterator<Unfiltered> iterator)
{
this.iterator = iterator;
}
protected Unfiltered computeNext()
{
if (!iterator.hasNext())
return endOfData();
Unfiltered next = iterator.next();
return iterator.hasNext() ? next : endOfData();
}
}
}