/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/
package org.apache.cassandra.db.rows;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Comparator;
import java.util.List;
import org.apache.cassandra.config.CFMetaData;
import org.apache.cassandra.db.*;
import org.apache.cassandra.db.filter.ClusteringIndexFilter;
import org.apache.cassandra.db.filter.ColumnFilter;
import org.apache.cassandra.db.transform.RTBoundValidator;
import org.apache.cassandra.io.sstable.IndexInfo;
import org.apache.cassandra.io.sstable.format.SSTableReader;
import org.apache.cassandra.io.sstable.format.SSTableReadsListener;
import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
import org.apache.cassandra.thrift.ThriftResultsMerger;
import org.apache.cassandra.utils.IteratorWithLowerBound;
/**
* An unfiltered row iterator with a lower bound retrieved from either the global
* sstable statistics or the row index lower bounds (if available in the cache).
* Before initializing the sstable unfiltered row iterator, we return an artificial range tombstone
* marker with the clustering set to the lower bound and a live deletion time. This marker will be
* filtered out during merging, and the result is that if we don't need to access this sstable, e.g. due
* to the LIMIT condition, then we will not. See CASSANDRA-8180 for examples of why this is useful.
*/
public class UnfilteredRowIteratorWithLowerBound extends LazilyInitializedUnfilteredRowIterator implements IteratorWithLowerBound<Unfiltered>
{
private final SSTableReader sstable;
private final ClusteringIndexFilter filter;
private final ColumnFilter selectedColumns;
private final boolean isForThrift;
private final int nowInSec;
private final boolean applyThriftTransformation;
private final SSTableReadsListener listener;
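// The lower bound for this iterator, computed lazily from either the partition index or the sstable metadata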
private ClusteringBound lowerBound;
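// Whether computeNext() has already returned its first item; the lower bound is only checked against that item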
private boolean firstItemRetrieved;
public UnfilteredRowIteratorWithLowerBound(DecoratedKey partitionKey,
SSTableReader sstable,
ClusteringIndexFilter filter,
ColumnFilter selectedColumns,
boolean isForThrift,
int nowInSec,
boolean applyThriftTransformation,
SSTableReadsListener listener)
{
super(partitionKey);
this.sstable = sstable;
this.filter = filter;
this.selectedColumns = selectedColumns;
this.isForThrift = isForThrift;
this.nowInSec = nowInSec;
this.applyThriftTransformation = applyThriftTransformation;
this.listener = listener;
this.lowerBound = null;
this.firstItemRetrieved = false;
}
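/**
 * @return an artificial range tombstone bound marker (with a live deletion time) whose clustering is a lower
 * bound for the clusterings returned by this iterator, or null if no lower bound could be computed.
 */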
public Unfiltered lowerBound()
{
if (lowerBound != null)
return makeBound(lowerBound);
// The partition index lower bound is more accurate than the sstable metadata lower bound, but it is only
// present if the iterator has already been initialized, which we only do when there are tombstones since in
// that case we cannot use the sstable metadata clustering values.
ClusteringBound ret = getPartitionIndexLowerBound();
return ret != null ? makeBound(ret) : makeBound(getMetadataLowerBound());
}
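/**
 * Wraps the given clustering bound, if not null, in an artificial range tombstone bound marker with a live
 * deletion time; the bound is remembered so that computeNext() can later check it against the first real item.
 */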
private Unfiltered makeBound(ClusteringBound bound)
{
if (bound == null)
return null;
if (lowerBound != bound)
lowerBound = bound;
return new RangeTombstoneBoundMarker(lowerBound, DeletionTime.LIVE);
}
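// Lazily creates the underlying sstable iterator; invoked at most once, via maybeInit() in the parent class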
@Override
protected UnfilteredRowIterator initializeIterator()
{
@SuppressWarnings("resource") // 'iter' is added to iterators which is closed on exception, or through the closing of the final merged iterator
UnfilteredRowIterator iter = sstable.iterator(partitionKey(), filter.getSlices(metadata()), selectedColumns, filter.isReversed(), isForThrift, listener);
if (isForThrift && applyThriftTransformation)
iter = ThriftResultsMerger.maybeWrap(iter, nowInSec);
return RTBoundValidator.validate(iter, RTBoundValidator.Stage.SSTABLE, false);
}
@Override
protected Unfiltered computeNext()
{
Unfiltered ret = super.computeNext();
if (firstItemRetrieved)
return ret;
// Check that the lower bound is not bigger than the first item retrieved
firstItemRetrieved = true;
if (lowerBound != null && ret != null)
assert comparator().compare(lowerBound, ret.clustering()) <= 0
: String.format("Lower bound [%s] is bigger than first returned value [%s] for sstable %s",
lowerBound.toString(sstable.metadata),
ret.toString(sstable.metadata),
sstable.getFilename());
return ret;
}
private Comparator<Clusterable> comparator()
{
return filter.isReversed() ? sstable.metadata.comparator.reversed() : sstable.metadata.comparator;
}
@Override
public CFMetaData metadata()
{
return sstable.metadata;
}
@Override
public boolean isReverseOrder()
{
return filter.isReversed();
}
@Override
public PartitionColumns columns()
{
return selectedColumns.fetchedColumns();
}
@Override
public EncodingStats stats()
{
return sstable.stats();
}
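// If the sstable cannot contain tombstones, there can be no partition-level deletion either, so we can answer
// without initializing the underlying iterator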
@Override
public DeletionTime partitionLevelDeletion()
{
if (!sstable.mayHaveTombstones())
return DeletionTime.LIVE;
return super.partitionLevelDeletion();
}
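// If no static columns are queried, the static row is necessarily empty, so we avoid touching the sstable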
@Override
public Row staticRow()
{
if (columns().statics.isEmpty())
return Rows.EMPTY_STATIC_ROW;
return super.staticRow();
}
/**
* @return the lower bound stored on the index entry for this partition, if available.
*/
private ClusteringBound getPartitionIndexLowerBound()
{
// NOTE: CASSANDRA-11206 removed the lookup against the key cache because IndexInfo objects are no longer
// kept in memory when they are not heap-backed (i.e. they live on disk).
// CASSANDRA-11369 tracks fixing this afterwards.
// Creating the iterator ensures that rowIndexEntry is loaded if available (partitions bigger than
// DatabaseDescriptor.column_index_size_in_kb)
if (!canUseMetadataLowerBound())
maybeInit();
RowIndexEntry rowIndexEntry = sstable.getCachedPosition(partitionKey(), false);
if (rowIndexEntry == null || !rowIndexEntry.indexOnHeap())
return null;
try (RowIndexEntry.IndexInfoRetriever onHeapRetriever = rowIndexEntry.openWithIndex(null))
{
IndexInfo column = onHeapRetriever.columnsIndex(filter.isReversed() ? rowIndexEntry.columnsIndexCount() - 1 : 0);
ClusteringPrefix lowerBoundPrefix = filter.isReversed() ? column.lastName : column.firstName;
assert lowerBoundPrefix.getRawValues().length <= sstable.metadata.comparator.size() :
String.format("Unexpected number of clustering values %d, expected %d or fewer for %s",
lowerBoundPrefix.getRawValues().length,
sstable.metadata.comparator.size(),
sstable.getFilename());
return ClusteringBound.inclusiveOpen(filter.isReversed(), lowerBoundPrefix.getRawValues());
}
catch (IOException e)
{
throw new RuntimeException("should never occur", e);
}
}
/**
* Whether we can use the clustering values in the stats of the sstable to build the lower bound.
* <p>
* Currently, the clustering values in the stats file record, for each clustering component, the min and max
* values seen, nulls excluded. In other words, having a non-null value for a component in those min/max clustering
* values does _not_ guarantee that there isn't an unfiltered in the sstable whose clustering has either no value for
* that component (it's a prefix) or a null value.
* <p>
* This is problematic because it means we can't in general build a lower bound from those values, since the "min"
* values don't actually guarantee minimality.
* <p>
* However, we can use those values if we can guarantee that no clustering in the sstable either 1) is a true prefix
* or 2) uses null values. Not having true prefixes means having no range tombstone markers since rows use
* {@link Clustering} which is always "full" (all components are always present). As for null values, we happen to
* only allow those in compact tables (for backward compatibility), so we can simply exclude those tables.
* <p>
* Note that the information we currently have at our disposal makes this condition less precise than it could be.
* In particular, {@link SSTableReader#mayHaveTombstones} could return {@code true} (making us not use the stats)
* because of cell tombstones or expiring cells even if the sstable has no range tombstone markers, even though
* it's really only markers we want to exclude here (more precisely, as said above, we want to exclude anything
* whose clustering is not "full", but that's only markers). It wouldn't be very hard, however, to track whether an
* sstable has any range tombstone markers, so that is a possible improvement.
*/
private boolean canUseMetadataLowerBound()
{
// Side-note: pre-2.1 sstable stats files had clustering value arrays whose size may not match the comparator size,
// and that would break getMetadataLowerBound. However, we don't support upgrading directly from 2.0 to 3.0, so it's
// not a real concern. Besides, !sstable.mayHaveTombstones already ensures this is a 3.0 sstable anyway.
return !sstable.mayHaveTombstones() && !sstable.metadata.isCompactTable();
}
/**
* @return a global lower bound made from the clustering values stored in the sstable metadata. Note that
* this currently does not correctly compare tombstone bounds, especially ranges.
*/
private ClusteringBound getMetadataLowerBound()
{
if (!canUseMetadataLowerBound())
return null;
final StatsMetadata m = sstable.getSSTableMetadata();
List<ByteBuffer> vals = filter.isReversed() ? m.maxClusteringValues : m.minClusteringValues;
assert vals.size() <= sstable.metadata.comparator.size() :
String.format("Unexpected number of clustering values %d, expected %d or fewer for %s",
vals.size(),
sstable.metadata.comparator.size(),
sstable.getFilename());
return ClusteringBound.inclusiveOpen(filter.isReversed(), vals.toArray(new ByteBuffer[vals.size()]));
}
}