src/java/org/apache/cassandra/db/partitions/PartitionUpdate.java - cassandra - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.cassandra.db.partitions;

 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
 import java.util.concurrent.TimeUnit;

 import com.google.common.collect.Iterables;
 import com.google.common.collect.Lists;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import org.apache.cassandra.config.CFMetaData;
 import org.apache.cassandra.config.ColumnDefinition;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.filter.ColumnFilter;
 import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.io.util.*;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.NoSpamLogger;
 import org.apache.cassandra.utils.btree.BTree;
 import org.apache.cassandra.utils.btree.UpdateFunction;

 /**
  * Stores updates made on a partition.
  * <p>
  * A PartitionUpdate object requires that all writes/additions are performed before we
  * try to read the updates (attempts to write to the PartitionUpdate after a read method
  * has been called will result in an exception being thrown). In other words, a Partition
  * is mutable while it's written but becomes immutable as soon as it is read.
  * <p>
  * A typical usage is to create a new update ({@code new PartitionUpdate(metadata, key, columns, capacity)})
  * and then add rows and range tombstones through the {@code add()} methods (the partition
  * level deletion time can also be set with {@code addPartitionDeletion()}). However, there
  * is also a few static helper constructor methods for special cases ({@code emptyUpdate()},
  * {@code fullPartitionDelete} and {@code singleRowUpdate}).
  */
 public class PartitionUpdate extends AbstractBTreePartition
 {
     protected static final Logger logger = LoggerFactory.getLogger(PartitionUpdate.class);

     public static final PartitionUpdateSerializer serializer = new PartitionUpdateSerializer();

     private final int createdAtInSec = FBUtilities.nowInSeconds();

     // Records whether this update is "built", i.e. if the build() method has been called, which
     // happens when the update is read. Further writing is then rejected though a manual call
     // to allowNewUpdates() allow new writes. We could make that more implicit but only triggers
     // really requires that so we keep it simple for now).
     private volatile boolean isBuilt;
     private boolean canReOpen = true;

     private Holder holder;
     private BTree.Builder<Row> rowBuilder;
     private MutableDeletionInfo deletionInfo;

     private final boolean canHaveShadowedData;

     private PartitionUpdate(CFMetaData metadata,
                             DecoratedKey key,
                             PartitionColumns columns,
                             MutableDeletionInfo deletionInfo,
                             int initialRowCapacity,
                             boolean canHaveShadowedData)
     {
         super(metadata, key);
         this.deletionInfo = deletionInfo;
         this.holder = new Holder(columns, BTree.empty(), deletionInfo, Rows.EMPTY_STATIC_ROW, EncodingStats.NO_STATS);
         this.canHaveShadowedData = canHaveShadowedData;
         rowBuilder = builder(initialRowCapacity);
     }

     private PartitionUpdate(CFMetaData metadata,
                             DecoratedKey key,
                             Holder holder,
                             MutableDeletionInfo deletionInfo,
                             boolean canHaveShadowedData)
     {
         super(metadata, key);
         this.holder = holder;
         this.deletionInfo = deletionInfo;
         this.isBuilt = true;
         this.canHaveShadowedData = canHaveShadowedData;
     }

     public PartitionUpdate(CFMetaData metadata,
                            DecoratedKey key,
                            PartitionColumns columns,
                            int initialRowCapacity)
     {
         this(metadata, key, columns, MutableDeletionInfo.live(), initialRowCapacity, true);
     }

     public PartitionUpdate(CFMetaData metadata,
                            ByteBuffer key,
                            PartitionColumns columns,
                            int initialRowCapacity)
     {
         this(metadata,
              metadata.decorateKey(key),
              columns,
              initialRowCapacity);
     }

     /**
      * Creates a empty immutable partition update.
      *
      * @param metadata the metadata for the created update.
      * @param key the partition key for the created update.
      *
      * @return the newly created empty (and immutable) update.
      */
     public static PartitionUpdate emptyUpdate(CFMetaData metadata, DecoratedKey key)
     {
         MutableDeletionInfo deletionInfo = MutableDeletionInfo.live();
         Holder holder = new Holder(PartitionColumns.NONE, BTree.empty(), deletionInfo, Rows.EMPTY_STATIC_ROW, EncodingStats.NO_STATS);
         return new PartitionUpdate(metadata, key, holder, deletionInfo, false);
     }

     /**
      * Creates an immutable partition update that entirely deletes a given partition.
      *
      * @param metadata the metadata for the created update.
      * @param key the partition key for the partition that the created update should delete.
      * @param timestamp the timestamp for the deletion.
      * @param nowInSec the current time in seconds to use as local deletion time for the partition deletion.
      *
      * @return the newly created partition deletion update.
      */
     public static PartitionUpdate fullPartitionDelete(CFMetaData metadata, DecoratedKey key, long timestamp, int nowInSec)
     {
         MutableDeletionInfo deletionInfo = new MutableDeletionInfo(timestamp, nowInSec);
         Holder holder = new Holder(PartitionColumns.NONE, BTree.empty(), deletionInfo, Rows.EMPTY_STATIC_ROW, EncodingStats.NO_STATS);
         return new PartitionUpdate(metadata, key, holder, deletionInfo, false);
     }

     /**
      * Creates an immutable partition update that contains a single row update.
      *
      * @param metadata the metadata for the created update.
      * @param key the partition key for the partition to update.
      * @param row the row for the update (may be null).
      * @param row the static row for the update (may be null).
      *
      * @return the newly created partition update containing only {@code row}.
      */
     public static PartitionUpdate singleRowUpdate(CFMetaData metadata, DecoratedKey key, Row row, Row staticRow)
     {
         MutableDeletionInfo deletionInfo = MutableDeletionInfo.live();
         Holder holder = new Holder(
             new PartitionColumns(
                 staticRow == null ? Columns.NONE : Columns.from(staticRow.columns()),
                 row == null ? Columns.NONE : Columns.from(row.columns())
             ),
             row == null ? BTree.empty() : BTree.singleton(row),
             deletionInfo,
             staticRow == null ? Rows.EMPTY_STATIC_ROW : staticRow,
             EncodingStats.NO_STATS
         );
         return new PartitionUpdate(metadata, key, holder, deletionInfo, false);
     }

     /**
      * Creates an immutable partition update that contains a single row update.
      *
      * @param metadata the metadata for the created update.
      * @param key the partition key for the partition to update.
      * @param row the row for the update (may be static).
      *
      * @return the newly created partition update containing only {@code row}.
      */
     public static PartitionUpdate singleRowUpdate(CFMetaData metadata, DecoratedKey key, Row row)
     {
         return singleRowUpdate(metadata, key, row.isStatic() ? null : row, row.isStatic() ? row : null);
     }

     /**
      * Creates an immutable partition update that contains a single row update.
      *
      * @param metadata the metadata for the created update.
      * @param key the partition key for the partition to update.
      * @param row the row for the update.
      *
      * @return the newly created partition update containing only {@code row}.
      */
     public static PartitionUpdate singleRowUpdate(CFMetaData metadata, ByteBuffer key, Row row)
     {
         return singleRowUpdate(metadata, metadata.decorateKey(key), row);
     }

     /**
      * Turns the given iterator into an update.
      *
      * @param iterator the iterator to turn into updates.
      * @param filter the column filter used when querying {@code iterator}. This is used to make
      * sure we don't include data for which the value has been skipped while reading (as we would
      * then be writing something incorrect).
      *
      * Warning: this method does not close the provided iterator, it is up to
      * the caller to close it.
      */
     public static PartitionUpdate fromIterator(UnfilteredRowIterator iterator, ColumnFilter filter)
     {

         return fromIterator(iterator, filter, true,  null);
     }

     private static final NoSpamLogger rowMergingLogger = NoSpamLogger.getLogger(logger, 1, TimeUnit.MINUTES);
     /**
      * Removes duplicate rows from incoming iterator, to be used when we can't trust the underlying iterator (like when reading legacy sstables)
      */
     public static PartitionUpdate fromPre30Iterator(UnfilteredRowIterator iterator, ColumnFilter filter)
     {
         return fromIterator(iterator, filter, false, (a, b) -> {
             CFMetaData cfm = iterator.metadata();
             rowMergingLogger.warn(String.format("Merging rows from pre 3.0 iterator for partition key: %s",
                                                 cfm.getKeyValidator().getString(iterator.partitionKey().getKey())));
             return Rows.merge(a, b, FBUtilities.nowInSeconds());
         });
     }

     private static PartitionUpdate fromIterator(UnfilteredRowIterator iterator, ColumnFilter filter, boolean ordered, BTree.Builder.QuickResolver<Row> quickResolver)
     {
         iterator = UnfilteredRowIterators.withOnlyQueriedData(iterator, filter);
         Holder holder = build(iterator, 16, ordered, quickResolver);
         MutableDeletionInfo deletionInfo = (MutableDeletionInfo) holder.deletionInfo;
         return new PartitionUpdate(iterator.metadata(), iterator.partitionKey(), holder, deletionInfo, false);
     }

     /**
      * Turns the given iterator into an update.
      *
      * @param iterator the iterator to turn into updates.
      * @param filter the column filter used when querying {@code iterator}. This is used to make
      * sure we don't include data for which the value has been skipped while reading (as we would
      * then be writing something incorrect).
      *
      * Warning: this method does not close the provided iterator, it is up to
      * the caller to close it.
      */
     public static PartitionUpdate fromIterator(RowIterator iterator, ColumnFilter filter)
     {
         iterator = RowIterators.withOnlyQueriedData(iterator, filter);
         MutableDeletionInfo deletionInfo = MutableDeletionInfo.live();
         Holder holder = build(iterator, deletionInfo, true, 16);
         return new PartitionUpdate(iterator.metadata(), iterator.partitionKey(), holder, deletionInfo, false);
     }

     protected boolean canHaveShadowedData()
     {
         return canHaveShadowedData;
     }

     /**
      * Deserialize a partition update from a provided byte buffer.
      *
      * @param bytes the byte buffer that contains the serialized update.
      * @param version the version with which the update is serialized.
      * @param key the partition key for the update. This is only used if {@code version &lt 3.0}
      * and can be {@code null} otherwise.
      *
      * @return the deserialized update or {@code null} if {@code bytes == null}.
      */
     public static PartitionUpdate fromBytes(ByteBuffer bytes, int version, DecoratedKey key)
     {
         if (bytes == null)
             return null;

         try
         {
             return serializer.deserialize(new DataInputBuffer(bytes, true),
                                           version,
                                           SerializationHelper.Flag.LOCAL,
                                           version < MessagingService.VERSION_30 ? key : null);
         }
         catch (IOException e)
         {
             throw new RuntimeException(e);
         }
     }

     /**
      * Serialize a partition update as a byte buffer.
      *
      * @param update the partition update to serialize.
      * @param version the version to serialize the update into.
      *
      * @return a newly allocated byte buffer containing the serialized update.
      */
     public static ByteBuffer toBytes(PartitionUpdate update, int version)
     {
         try (DataOutputBuffer out = new DataOutputBuffer())
         {
             serializer.serialize(update, out, version);
             return out.buffer();
         }
         catch (IOException e)
         {
             throw new RuntimeException(e);
         }
     }

     /**
      * Creates a partition update that entirely deletes a given partition.
      *
      * @param metadata the metadata for the created update.
      * @param key the partition key for the partition that the created update should delete.
      * @param timestamp the timestamp for the deletion.
      * @param nowInSec the current time in seconds to use as local deletion time for the partition deletion.
      *
      * @return the newly created partition deletion update.
      */
     public static PartitionUpdate fullPartitionDelete(CFMetaData metadata, ByteBuffer key, long timestamp, int nowInSec)
     {
         return fullPartitionDelete(metadata, metadata.decorateKey(key), timestamp, nowInSec);
     }

     /**
      * Merges the provided updates, yielding a new update that incorporates all those updates.
      *
      * @param updates the collection of updates to merge. This shouldn't be empty.
      *
      * @return a partition update that include (merge) all the updates from {@code updates}.
      */
     public static PartitionUpdate merge(List<PartitionUpdate> updates)
     {
         assert !updates.isEmpty();
         final int size = updates.size();

         if (size == 1)
             return Iterables.getOnlyElement(updates);

         int nowInSecs = FBUtilities.nowInSeconds();
         List<UnfilteredRowIterator> asIterators = Lists.transform(updates, AbstractBTreePartition::unfilteredIterator);
         return fromIterator(UnfilteredRowIterators.merge(asIterators, nowInSecs), ColumnFilter.all(updates.get(0).metadata()));
     }

     // We override this, because the version in the super-class calls holder(), which build the update preventing
     // further updates, but that's not necessary here and being able to check at least the partition deletion without
     // "locking" the update is nice (and used in DataResolver.RepairMergeListener.MergeListener).
     @Override
     public DeletionInfo deletionInfo()
     {
         return deletionInfo;
     }

     /**
      * Modify this update to set every timestamp for live data to {@code newTimestamp} and
      * every deletion timestamp to {@code newTimestamp - 1}.
      *
      * There is no reason to use that expect on the Paxos code path, where we need ensure that
      * anything inserted use the ballot timestamp (to respect the order of update decided by
      * the Paxos algorithm). We use {@code newTimestamp - 1} for deletions because tombstones
      * always win on timestamp equality and we don't want to delete our own insertions
      * (typically, when we overwrite a collection, we first set a complex deletion to delete the
      * previous collection before adding new elements. If we were to set that complex deletion
      * to the same timestamp that the new elements, it would delete those elements). And since
      * tombstones always wins on timestamp equality, using -1 guarantees our deletion will still
      * delete anything from a previous update.
      */
     public void updateAllTimestamp(long newTimestamp)
     {
         Holder holder = holder();
         deletionInfo.updateAllTimestamp(newTimestamp - 1);
         Object[] tree = BTree.<Row>transformAndFilter(holder.tree, (x) -> x.updateAllTimestamp(newTimestamp));
         Row staticRow = holder.staticRow.updateAllTimestamp(newTimestamp);
         EncodingStats newStats = EncodingStats.Collector.collect(staticRow, BTree.<Row>iterator(tree), deletionInfo);
         this.holder = new Holder(holder.columns, tree, deletionInfo, staticRow, newStats);
     }

     /**
      * The number of "operations" contained in the update.
      * <p>
      * This is used by {@code Memtable} to approximate how much work this update does. In practice, this
      * count how many rows are updated and how many ranges are deleted by the partition update.
      *
      * @return the number of "operations" performed by the update.
      */
     public int operationCount()
     {
         return rowCount()
              + (staticRow().isEmpty() ? 0 : 1)
              + deletionInfo.rangeCount()
              + (deletionInfo.getPartitionDeletion().isLive() ? 0 : 1);
     }

     /**
      * The size of the data contained in this update.
      *
      * @return the size of the data contained in this update.
      */
     public int dataSize()
     {
         int size = 0;

         if (holder.staticRow != null)
         {
             for (ColumnData cd : holder.staticRow.columnData())
             {
                 size += cd.dataSize();
             }
         }

         for (Row row : this)
         {
             size += row.clustering().dataSize();
             for (ColumnData cd : row)
                 size += cd.dataSize();
         }
         return size;
     }

     @Override
     public PartitionColumns columns()
     {
         // The superclass implementation calls holder(), but that triggers a build of the PartitionUpdate. But since
         // the columns are passed to the ctor, we know the holder always has the proper columns even if it doesn't have
         // the built rows yet, so just bypass the holder() method.
         return holder.columns;
     }

     protected Holder holder()
     {
         maybeBuild();
         return holder;
     }

     public EncodingStats stats()
     {
         return holder().stats;
     }

     /**
      * If a partition update has been read (and is thus unmodifiable), a call to this method
      * makes the update modifiable again.
      * <p>
      * Please note that calling this method won't result in optimal behavior in the sense that
      * even if very little is added to the update after this call, the whole update will be sorted
      * again on read. This should thus be used sparingly (and if it turns that we end up using
      * this often, we should consider optimizing the behavior).
      */
     public synchronized void allowNewUpdates()
     {
         if (!canReOpen)
             throw new IllegalStateException("You cannot do more updates on collectCounterMarks has been called");

         // This is synchronized to make extra sure things work properly even if this is
         // called concurrently with sort() (which should be avoided in the first place, but
         // better safe than sorry).
         isBuilt = false;
         if (rowBuilder == null)
             rowBuilder = builder(16);
     }

     private BTree.Builder<Row> builder(int initialCapacity)
     {
         return BTree.<Row>builder(metadata.comparator, initialCapacity)
                     .setQuickResolver((a, b) ->
                                       Rows.merge(a, b, createdAtInSec));
     }

     /**
      * Returns an iterator that iterates over the rows of this update in clustering order.
      * <p>
      * Note that this might trigger a sorting of the update, and as such the update will not
      * be modifiable anymore after this call.
      *
      * @return an iterator over the rows of this update.
      */
     @Override
     public Iterator<Row> iterator()
     {
         maybeBuild();
         return super.iterator();
     }

     /**
      * Validates the data contained in this update.
      *
      * @throws org.apache.cassandra.serializers.MarshalException if some of the data contained in this update is corrupted.
      */
     public void validate()
     {
         for (Row row : this)
         {
             metadata().comparator.validate(row.clustering());
             for (ColumnData cd : row)
                 cd.validate();
         }
     }

     /**
      * The maximum timestamp used in this update.
      *
      * @return the maximum timestamp used in this update.
      */
     public long maxTimestamp()
     {
         maybeBuild();

         long maxTimestamp = deletionInfo.maxTimestamp();
         for (Row row : this)
         {
             maxTimestamp = Math.max(maxTimestamp, row.primaryKeyLivenessInfo().timestamp());
             for (ColumnData cd : row)
             {
                 if (cd.column().isSimple())
                 {
                     maxTimestamp = Math.max(maxTimestamp, ((Cell)cd).timestamp());
                 }
                 else
                 {
                     ComplexColumnData complexData = (ComplexColumnData)cd;
                     maxTimestamp = Math.max(maxTimestamp, complexData.complexDeletion().markedForDeleteAt());
                     for (Cell cell : complexData)
                         maxTimestamp = Math.max(maxTimestamp, cell.timestamp());
                 }
             }
         }

         if (holder.staticRow != null)
         {
             for (ColumnData cd : holder.staticRow.columnData())
             {
                 if (cd.column().isSimple())
                 {
                     maxTimestamp = Math.max(maxTimestamp, ((Cell) cd).timestamp());
                 }
                 else
                 {
                     ComplexColumnData complexData = (ComplexColumnData) cd;
                     maxTimestamp = Math.max(maxTimestamp, complexData.complexDeletion().markedForDeleteAt());
                     for (Cell cell : complexData)
                         maxTimestamp = Math.max(maxTimestamp, cell.timestamp());
                 }
             }
         }
         return maxTimestamp;
     }

     /**
      * For an update on a counter table, returns a list containing a {@code CounterMark} for
      * every counter contained in the update.
      *
      * @return a list with counter marks for every counter in this update.
      */
     public List<CounterMark> collectCounterMarks()
     {
         assert metadata().isCounter();
         maybeBuild();
         // We will take aliases on the rows of this update, and update them in-place. So we should be sure the
         // update is now immutable for all intent and purposes.
         canReOpen = false;

         List<CounterMark> marks = new ArrayList<>();
         addMarksForRow(staticRow(), marks);
         for (Row row : this)
             addMarksForRow(row, marks);
         return marks;
     }

     private void addMarksForRow(Row row, List<CounterMark> marks)
     {
         for (Cell cell : row.cells())
         {
             if (cell.isCounterCell())
                 marks.add(new CounterMark(row, cell.column(), cell.path()));
         }
     }

     private void assertNotBuilt()
     {
         if (isBuilt)
             throw new IllegalStateException("An update should not be written again once it has been read");
     }

     public void addPartitionDeletion(DeletionTime deletionTime)
     {
         assertNotBuilt();
         deletionInfo.add(deletionTime);
     }

     public void add(RangeTombstone range)
     {
         assertNotBuilt();
         deletionInfo.add(range, metadata.comparator);
     }

     /**
      * Adds a row to this update.
      *
      * There is no particular assumption made on the order of row added to a partition update. It is further
      * allowed to add the same row (more precisely, multiple row objects for the same clustering).
      *
      * Note however that the columns contained in the added row must be a subset of the columns used when
      * creating this update.
      *
      * @param row the row to add.
      */
     public void add(Row row)
     {
         if (row.isEmpty())
             return;

         assertNotBuilt();

         if (row.isStatic())
         {
             // this assert is expensive, and possibly of limited value; we should consider removing it
             // or introducing a new class of assertions for test purposes
             assert columns().statics.containsAll(row.columns()) : columns().statics + " is not superset of " + row.columns();
             Row staticRow = holder.staticRow.isEmpty()
                       ? row
                       : Rows.merge(holder.staticRow, row, createdAtInSec);
             holder = new Holder(holder.columns, holder.tree, holder.deletionInfo, staticRow, holder.stats);
         }
         else
         {
             // this assert is expensive, and possibly of limited value; we should consider removing it
             // or introducing a new class of assertions for test purposes
             assert columns().regulars.containsAll(row.columns()) : columns().regulars + " is not superset of " + row.columns();
             rowBuilder.add(row);
         }
     }

     private void maybeBuild()
     {
         if (isBuilt)
             return;

         build();
     }

     private synchronized void build()
     {
         if (isBuilt)
             return;

         Holder holder = this.holder;
         Object[] cur = holder.tree;
         Object[] add = rowBuilder.build();
         Object[] merged = BTree.<Row>merge(cur, add, metadata.comparator,
                                            UpdateFunction.Simple.of((a, b) -> Rows.merge(a, b, createdAtInSec)));

         assert deletionInfo == holder.deletionInfo;
         EncodingStats newStats = EncodingStats.Collector.collect(holder.staticRow, BTree.<Row>iterator(merged), deletionInfo);

         this.holder = new Holder(holder.columns, merged, holder.deletionInfo, holder.staticRow, newStats);
         rowBuilder = null;
         isBuilt = true;
     }

     @Override
     public String toString()
     {
         if (isBuilt)
             return super.toString();

         // We intentionally override AbstractBTreePartition#toString() to avoid iterating over the rows in the
         // partition, which can result in build() being triggered and lead to errors if the PartitionUpdate is later
         // modified.

         StringBuilder sb = new StringBuilder();
         sb.append(String.format("[%s.%s] key=%s columns=%s",
                                 metadata.ksName,
                                 metadata.cfName,
                                 metadata.getKeyValidator().getString(partitionKey().getKey()),
                                 columns()));

         sb.append("\n    deletionInfo=").append(deletionInfo);
         sb.append(" (not built)");
         return sb.toString();
     }

     /**
      * Creates a new simple partition update builder.
      *
      * @param metadata the metadata for the table this is a partition of.
      * @param partitionKeyValues the values for partition key columns identifying this partition. The values for each
      * partition key column can be passed either directly as {@code ByteBuffer} or using a "native" value (int for
      * Int32Type, string for UTF8Type, ...). It is also allowed to pass a single {@code DecoratedKey} value directly.
      * @return a newly created builder.
      */
     public static SimpleBuilder simpleBuilder(CFMetaData metadata, Object... partitionKeyValues)
     {
         return new SimpleBuilders.PartitionUpdateBuilder(metadata, partitionKeyValues);
     }

     /**
      * Interface for building partition updates geared towards human.
      * <p>
      * This should generally not be used when performance matters too much, but provides a more convenient interface to
      * build an update than using the class constructor when performance is not of the utmost importance.
      */
     public interface SimpleBuilder
     {
         /**
          * The metadata of the table this is a builder on.
          */
         public CFMetaData metadata();

         /**
          * Sets the timestamp to use for the following additions to this builder or any derived (row) builder.
          *
          * @param timestamp the timestamp to use for following additions. If that timestamp hasn't been set, the current
          * time in microseconds will be used.
          * @return this builder.
          */
         public SimpleBuilder timestamp(long timestamp);

         /**
          * Sets the ttl to use for the following additions to this builder or any derived (row) builder.
          *
          * @param ttl the ttl to use for following additions. If that ttl hasn't been set, no ttl will be used.
          * @return this builder.
          */
         public SimpleBuilder ttl(int ttl);

         /**
          * Sets the current time to use for the following additions to this builder or any derived (row) builder.
          *
          * @param nowInSec the current time to use for following additions. If the current time hasn't been set, the current
          * time in seconds will be used.
          * @return this builder.
          */
         public SimpleBuilder nowInSec(int nowInSec);

         /**
          * Adds the row identifier by the provided clustering and return a builder for that row.
          *
          * @param clusteringValues the value for the clustering columns of the row to add to this build. There may be no
          * values if either the table has no clustering column, or if you want to edit the static row. Note that as a
          * shortcut it is also allowed to pass a {@code Clustering} object directly, in which case that should be the
          * only argument.
          * @return a builder for the row identified by {@code clusteringValues}.
          */
         public Row.SimpleBuilder row(Object... clusteringValues);

         /**
          * Deletes the partition identified by this builder (using a partition level deletion).
          *
          * @return this builder.
          */
         public SimpleBuilder delete();

         /**
          * Adds a new range tombstone to this update, returning a builder for that range.
          *
          * @return the range tombstone builder for the newly added range.
          */
         public RangeTombstoneBuilder addRangeTombstone();

         /**
          * Build the update represented by this builder.
          *
          * @return the built update.
          */
         public PartitionUpdate build();

         /**
          * As shortcut for {@code new Mutation(build())}.
          *
          * @return the built update, wrapped in a {@code Mutation}.
          */
         public Mutation buildAsMutation();

         /**
          * Interface to build range tombstone.
          *
          * By default, if no other methods are called, the represented range is inclusive of both start and end and
          * includes everything (its start is {@code BOTTOM} and it's end is {@code TOP}).
          */
         public interface RangeTombstoneBuilder
         {
             /**
              * Sets the start for the built range using the provided values.
              *
              * @param values the value for the start of the range. They act like the {@code clusteringValues} argument
              * of the {@link SimpleBuilder#row(Object...)} method, except that it doesn't have to be a full
              * clustering, it can only be a prefix.
              * @return this builder.
              */
             public RangeTombstoneBuilder start(Object... values);

             /**
              * Sets the end for the built range using the provided values.
              *
              * @param values the value for the end of the range. They act like the {@code clusteringValues} argument
              * of the {@link SimpleBuilder#row(Object...)} method, except that it doesn't have to be a full
              * clustering, it can only be a prefix.
              * @return this builder.
              */
             public RangeTombstoneBuilder end(Object... values);

             /**
              * Sets the start of this range as inclusive.
              * <p>
              * This is the default and don't need to be called, but can for explicitness.
              *
              * @return this builder.
              */
             public RangeTombstoneBuilder inclStart();

             /**
              * Sets the start of this range as exclusive.
              *
              * @return this builder.
              */
             public RangeTombstoneBuilder exclStart();

             /**
              * Sets the end of this range as inclusive.
              * <p>
              * This is the default and don't need to be called, but can for explicitness.
              *
              * @return this builder.
              */
             public RangeTombstoneBuilder inclEnd();

             /**
              * Sets the end of this range as exclusive.
              *
              * @return this builder.
              */
             public RangeTombstoneBuilder exclEnd();
         }
     }

     public static class PartitionUpdateSerializer
     {
         public void serialize(PartitionUpdate update, DataOutputPlus out, int version) throws IOException
         {
             try (UnfilteredRowIterator iter = update.unfilteredIterator())
             {
                 assert !iter.isReverseOrder();

                 if (version < MessagingService.VERSION_30)
                 {
                     LegacyLayout.serializeAsLegacyPartition(null, iter, out, version);
                 }
                 else
                 {
                     CFMetaData.serializer.serialize(update.metadata(), out, version);
                     UnfilteredRowIteratorSerializer.serializer.serialize(iter, null, out, version, update.rowCount());
                 }
             }
         }

         public PartitionUpdate deserialize(DataInputPlus in, int version, SerializationHelper.Flag flag, ByteBuffer key) throws IOException
         {
             if (version >= MessagingService.VERSION_30)
             {
                 assert key == null; // key is only there for the old format
                 return deserialize30(in, version, flag);
             }
             else
             {
                 assert key != null;
                 return deserializePre30(in, version, flag, key);
             }
         }

         // Used to share same decorated key between updates.
         public PartitionUpdate deserialize(DataInputPlus in, int version, SerializationHelper.Flag flag, DecoratedKey key) throws IOException
         {
             if (version >= MessagingService.VERSION_30)
             {
                 return deserialize30(in, version, flag);
             }
             else
             {
                 assert key != null;
                 return deserializePre30(in, version, flag, key.getKey());
             }
         }

         private static PartitionUpdate deserialize30(DataInputPlus in, int version, SerializationHelper.Flag flag) throws IOException
         {
             CFMetaData metadata = CFMetaData.serializer.deserialize(in, version);
             UnfilteredRowIteratorSerializer.Header header = UnfilteredRowIteratorSerializer.serializer.deserializeHeader(metadata, null, in, version, flag);
             if (header.isEmpty)
                 return emptyUpdate(metadata, header.key);

             assert !header.isReversed;
             assert header.rowEstimate >= 0;

             MutableDeletionInfo.Builder deletionBuilder = MutableDeletionInfo.builder(header.partitionDeletion, metadata.comparator, false);
             BTree.Builder<Row> rows = BTree.builder(metadata.comparator, header.rowEstimate);
             rows.auto(false);

             try (UnfilteredRowIterator partition = UnfilteredRowIteratorSerializer.serializer.deserialize(in, version, metadata, flag, header))
             {
                 while (partition.hasNext())
                 {
                     Unfiltered unfiltered = partition.next();
                     if (unfiltered.kind() == Unfiltered.Kind.ROW)
                         rows.add((Row)unfiltered);
                     else
                         deletionBuilder.add((RangeTombstoneMarker)unfiltered);
                 }
             }

             MutableDeletionInfo deletionInfo = deletionBuilder.build();
             return new PartitionUpdate(metadata,
                                        header.key,
                                        new Holder(header.sHeader.columns(), rows.build(), deletionInfo, header.staticRow, header.sHeader.stats()),
                                        deletionInfo,
                                        false);
         }

         private static PartitionUpdate deserializePre30(DataInputPlus in, int version, SerializationHelper.Flag flag, ByteBuffer key) throws IOException
         {
             try (UnfilteredRowIterator iterator = LegacyLayout.deserializeLegacyPartition(in, version, flag, key))
             {
                 assert iterator != null; // This is only used in mutation, and mutation have never allowed "null" column families
                 return PartitionUpdate.fromPre30Iterator(iterator, ColumnFilter.all(iterator.metadata()));
             }
         }

         public long serializedSize(PartitionUpdate update, int version)
         {
             try (UnfilteredRowIterator iter = update.unfilteredIterator())
             {
                 if (version < MessagingService.VERSION_30)
                     return LegacyLayout.serializedSizeAsLegacyPartition(null, iter, version);

                 return CFMetaData.serializer.serializedSize(update.metadata(), version)
                      + UnfilteredRowIteratorSerializer.serializer.serializedSize(iter, null, version, update.rowCount());
             }
         }
     }

     /**
      * A counter mark is basically a pointer to a counter update inside this partition update. That pointer allows
      * us to update the counter value based on the pre-existing value read during the read-before-write that counters
      * do. See {@link CounterMutation} to understand how this is used.
      */
     public static class CounterMark
     {
         private final Row row;
         private final ColumnDefinition column;
         private final CellPath path;

         private CounterMark(Row row, ColumnDefinition column, CellPath path)
         {
             this.row = row;
             this.column = column;
             this.path = path;
         }

         public Clustering clustering()
         {
             return row.clustering();
         }

         public ColumnDefinition column()
         {
             return column;
         }

         public CellPath path()
         {
             return path;
         }

         public ByteBuffer value()
         {
             return path == null
                  ? row.getCell(column).value()
                  : row.getCell(column, path).value();
         }

         public void setValue(ByteBuffer value)
         {
             // This is a bit of a giant hack as this is the only place where we mutate a Row object. This makes it more efficient
             // for counters however and this won't be needed post-#6506 so that's probably fine.
             assert row instanceof BTreeRow;
             ((BTreeRow)row).setValue(column, path, value);
         }
     }
 }