blob: 3a18a8c7d7b0fc3e15a9f9b619b363f4ed17019e [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.db.memtable;
import java.util.concurrent.atomic.AtomicReference;
import javax.annotation.concurrent.NotThreadSafe;
import org.apache.cassandra.db.ColumnFamilyStore;
import org.apache.cassandra.db.PartitionPosition;
import org.apache.cassandra.db.RegularAndStaticColumns;
import org.apache.cassandra.db.commitlog.CommitLogPosition;
import org.apache.cassandra.db.partitions.Partition;
import org.apache.cassandra.db.partitions.PartitionUpdate;
import org.apache.cassandra.db.rows.EncodingStats;
import org.apache.cassandra.db.rows.UnfilteredSource;
import org.apache.cassandra.index.transactions.UpdateTransaction;
import org.apache.cassandra.io.sstable.format.SSTableWriter;
import org.apache.cassandra.metrics.TableMetrics;
import org.apache.cassandra.schema.TableMetadata;
import org.apache.cassandra.schema.TableMetadataRef;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.concurrent.Future;
import org.apache.cassandra.utils.concurrent.OpOrder;
/**
* Memtable interface. This defines the operations the ColumnFamilyStore can perform with memtables.
* They are of several types:
* - construction factory interface
* - write and read operations: put, rowIterator and partitionIterator
* - statistics and features, including partition counts, data size, encoding stats, written columns
* - memory usage tracking, including methods of retrieval and of adding extra allocated space (used by non-CFS
* secondary indexes)
* - flush functionality, preparing the set of partitions to flush for given ranges
* - lifecycle management, i.e. operations that prepare and execute switch to a different memtable, together
* with ways of tracking the affected commit log spans
*
* See Memtable_API.md for details on implementing and using alternative memtable implementations.
*/
public interface Memtable extends Comparable<Memtable>, UnfilteredSource
{
/**
* Sentinel timestamp value.
* NOTE(review): presumably signals that no minimum timestamp is available; the users of this constant are not
* visible in this file -- confirm against the call sites.
*/
public static final long NO_MIN_TIMESTAMP = -1;
// Construction
/**
* Factory interface for constructing memtables, and querying write durability features.
*
* The factory is chosen using the MemtableParams class (passed as argument to
* {@code CREATE TABLE ... WITH memtable = '<configuration_name>'} where the configuration definition is a map given
* under {@code memtable_configurations} in cassandra.yaml). To make that possible, implementations must provide
* either a static {@code FACTORY} field (if they accept no further option) or a static
* {@code factory(Map<String, String>)} method. In the latter case, the method should avoid creating
* multiple instances of the factory for the same parameters, or factories should at least implement hashCode and
* equals.
*/
interface Factory
{
/**
* Create a memtable.
*
* @param commitLogLowerBound A commit log lower bound for the new memtable. This will be equal to the previous
* memtable's upper bound and defines the span of positions that any flushed sstable
* will cover.
* @param metadaRef Pointer to the up-to-date table metadata.
* @param owner Owning object that will receive flush requests triggered by the memtable (e.g. on expiration).
* @return the newly created memtable
*/
Memtable create(AtomicReference<CommitLogPosition> commitLogLowerBound, TableMetadataRef metadaRef, Owner owner);
/**
* If the memtable can achieve write durability directly (i.e. using some feature other than the commitlog, e.g.
* persistent memory), it can return true here, in which case the commit log will not store mutations in this
* table.
* Note that doing so will prevent point-in-time restores and changed data capture, thus a durable memtable must
* allow the option of turning commit log writing on even if it does not need it.
*
* @return true if the commit log should not store mutations for this table; this default returns false
*/
default boolean writesShouldSkipCommitLog()
{
return false;
}
/**
* This should be true if the memtable can achieve write durability for crash recovery directly (i.e. using some
* feature other than the commitlog, e.g. persistent memory).
* Setting this flag to true means that the commitlog should not replay mutations for this table on restart,
* and that it should not try to preserve segments that contain relevant data.
* Unless {@link #writesShouldSkipCommitLog()} is also true, writes will still be recorded in the commit log as
* they may be needed for changed data capture or point-in-time restore.
*
* @return true if the memtable itself provides crash-recovery durability; this default returns false
*/
default boolean writesAreDurable()
{
return false;
}
/**
* Normally we can receive streamed sstables directly, skipping the memtable stage (zero-copy-streaming). When
* the memtable is the primary data store (e.g. persistent memtables), it will usually prefer to receive the
* data instead.
*
* If this returns true, the content of all streamed sstables will be read and replayed as mutations, disabling
* zero-copy streaming.
*
* @return true if streamed data should be replayed into the memtable; this default returns false
*/
default boolean streamToMemtable()
{
return false;
}
/**
* When we need to stream data, we usually flush and stream the resulting sstables. This will not work correctly
* if the memtable does not want to flush for streaming (e.g. persistent memtables acting as primary data
* store), because data (not just recent) will be missing from the streamed view. Such memtables must present
* their data separately for streaming.
* In other words if the memtable returns false on shouldSwitch(STREAMING/REPAIR), its factory must return true
* here.
*
* If this flag returns true, streaming will write the relevant content that resides in the memtable to
* temporary sstables, stream these sstables and then delete them.
*
* @return true if memtable content must be written out separately for streaming; this default returns false
*/
default boolean streamFromMemtable()
{
return false;
}
/**
* Override this method to include implementation-specific memtable metrics in the table metrics.
*
* Memtable metrics lifecycle matches table lifecycle. It is the table that owns the metrics and
* decides when to release them.
*
* @param metadataRef reference to the metadata of the table the metrics belong to
* @return the implementation-specific metrics; this default implementation returns null
*/
default TableMetrics.ReleasableMetric createMemtableMetrics(TableMetadataRef metadataRef)
{
return null;
}
}
/**
* Interface for providing signals back and requesting information from the owner, i.e. the object that controls the
* memtable. This is usually the ColumnFamilyStore; the interface is used to limit the dependency of memtables on
* the details of its implementation.
*/
interface Owner
{
/**
* Signal to the owner that a flush is required (e.g. in response to hitting space limits).
*
* @param memtable the memtable the request concerns -- NOTE(review): presumably the one raising the signal; confirm
* @param reason why the flush is being requested
* @return a future yielding a commit log position -- NOTE(review): presumably completed once the flush is done;
* confirm with the implementing owner
*/
Future<CommitLogPosition> signalFlushRequired(Memtable memtable, ColumnFamilyStore.FlushReason reason);
/**
* Get the current memtable for this owner. Used to avoid capturing memtable in scheduled flush tasks.
*
* @return the owner's current memtable
*/
Memtable getCurrentMemtable();
/**
* Collect the index memtables flushed together with this. Used to accurately calculate memory that would be
* freed by a flush.
*
* @return the index memtables that are flushed together with the owner's memtable
*/
Iterable<Memtable> getIndexMemtables();
/**
* Construct a list of boundaries that split the locally-owned ranges into the given number of shards,
* splitting the owned space evenly. It is up to the memtable to use this information.
* Any changes in the ring structure (e.g. added or removed nodes) will invalidate the splits; in such a case
* the memtable will be sent a {@link Memtable#shouldSwitch}(OWNED_RANGES_CHANGE) and, should that return false, a
* {@link Memtable#localRangesUpdated()} call.
*
* @param shardCount the number of shards to split the locally-owned ranges into
* @return the boundaries of the resulting shards
*/
ShardBoundaries localRangeSplits(int shardCount);
}
// Main write and read operations
/**
* Put new data in the memtable. This operation may block until enough memory is available in the memory pool.
*
* @param update the partition update, may be a new partition or an update to an existing one
* @param indexer receives information about the update's effect
* @param opGroup write operation group, used to permit the operation to complete if it is needed to complete a
* flush to free space.
*
* @return the smallest timestamp delta between corresponding rows from the existing data and {@code update}, where
* a timestamp delta is computed as the difference between the cells and DeletionTimes from any existing partition
* and those in {@code update}. See CASSANDRA-7979.
*/
long put(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup);
// Read operations are provided by the UnfilteredSource interface.
// Statistics
/**
* Number of partitions stored in the memtable.
*
* @return the partition count
*/
long partitionCount();
/**
* Size of the data not accounting for any metadata / mapping overheads.
*
* @return the live data size
*/
long getLiveDataSize();
/**
* Number of "operations" (in the sense defined in {@link PartitionUpdate#operationCount()}) the memtable has
* executed.
*
* @return the operation count
*/
long operationCount();
/**
* The table's definition metadata.
*
* Note that this tracks the current state of the table and is not necessarily the same as what was used to create
* the memtable.
*
* @return the current table metadata
*/
TableMetadata metadata();
// Memory usage tracking
/**
* Add this memtable's used memory to the given usage object. This can be used to retrieve a single memtable's usage
* as well as to combine the ones of related memtables (e.g. a table and its table-based secondary indexes).
*
* @param usage the holder that this memtable's memory usage is added to
*/
void addMemoryUsageTo(MemoryUsage usage);
/**
* Creates a holder for memory usage collection.
*
* This is used to track on- and off-heap memory, as well as the ratio to the total permitted memtable memory.
*
* @return a new {@link MemoryUsage} instance, with all of its counters still at zero
*/
static MemoryUsage newMemoryUsage()
{
return new MemoryUsage();
}
/**
 * Shorthand for getting a given memtable's memory usage.
 * Implemented as a static to prevent implementations altering expectations by e.g. returning a cached object.
 *
 * @param memtable the memtable whose usage is requested
 * @return a freshly created holder to which {@code memtable} has added its usage
 */
static MemoryUsage getMemoryUsage(Memtable memtable)
{
    final MemoryUsage collected = newMemoryUsage();
    // The memtable reports into the holder; nothing is cached or shared between calls.
    memtable.addMemoryUsageTo(collected);
    return collected;
}
@NotThreadSafe
class MemoryUsage
{
    /** Bytes of on-heap memory used */
    public long ownsOnHeap = 0L;

    /** Bytes of off-heap memory used */
    public long ownsOffHeap = 0L;

    /** On-heap usage expressed as a ratio to the permitted memtable space */
    public float ownershipRatioOnHeap = 0f;

    /** Off-heap usage expressed as a ratio to the permitted memtable space */
    public float ownershipRatioOffHeap = 0f;

    /**
     * Renders the on-heap and off-heap figures, each as a pretty-printed size followed by its ratio shown as a
     * percentage without decimals.
     */
    @Override
    public String toString()
    {
        // Ratios are stored as fractions; scale them for display.
        float onHeapPercent = ownershipRatioOnHeap * 100;
        float offHeapPercent = ownershipRatioOffHeap * 100;

        return String.format("%s (%.0f%%) on-heap, %s (%.0f%%) off-heap",
                             FBUtilities.prettyPrintMemory(ownsOnHeap),
                             onHeapPercent,
                             FBUtilities.prettyPrintMemory(ownsOffHeap),
                             offHeapPercent);
    }
}
/**
* Adjust the used on-heap space by the given size (e.g. to reflect memory used by a non-table-based index).
* This operation may block until enough memory is available in the memory pool.
*
* @param additionalSpace the number of allocated on-heap bytes to account for
* @param opGroup write operation group, used to permit the operation to complete if it is needed to complete a
* flush to free space.
*/
void markExtraOnHeapUsed(long additionalSpace, OpOrder.Group opGroup);
/**
* Adjust the used off-heap space by the given size (e.g. to reflect memory used by a non-table-based index).
* This operation may block until enough memory is available in the memory pool.
*
* @param additionalSpace the number of allocated off-heap bytes to account for
* @param opGroup write operation group, used to permit the operation to complete if it is needed to complete a
* flush to free space.
*/
void markExtraOffHeapUsed(long additionalSpace, OpOrder.Group opGroup);
// Flushing
/**
* Get the collection of data between the given partition boundaries in a form suitable for flushing.
*
* @param from partition boundary at which the flush set starts
* @param to partition boundary at which the flush set ends
* NOTE(review): whether each boundary is inclusive or exclusive is not specified here -- confirm with implementations.
* @return the partitions between the two boundaries, together with the information needed to write an sstable
*/
FlushablePartitionSet<?> getFlushSet(PartitionPosition from, PartitionPosition to);
/**
 * A collection of partitions for flushing plus some information required for writing an sstable.
 *
 * Note that the listed entries must conform with the specified metadata. In particular, if the memtable is still
 * being written to, care must be taken to not list newer items as they may violate the bounds collected by the
 * encoding stats or refer to columns that don't exist in the collected columns set.
 *
 * @param <P> the partition type produced when iterating over this set
 */
interface FlushablePartitionSet<P extends Partition> extends Iterable<P>, SSTableWriter.SSTableSizeParameters
{
    /** The memtable associated with this flush set; {@link #metadata()} is read from it by default */
    Memtable memtable();

    /** Partition boundary at which this set starts (cf. the {@code from} argument of {@link Memtable#getFlushSet}) */
    PartitionPosition from();

    /** Partition boundary at which this set ends (cf. the {@code to} argument of {@link Memtable#getFlushSet}) */
    PartitionPosition to();

    /** The commit log position at the time that this memtable was created */
    CommitLogPosition commitLogLowerBound();

    /** The commit log position at the time that this memtable was switched out */
    CommitLogPosition commitLogUpperBound();

    /** The set of all columns that have been written */
    RegularAndStaticColumns columns();

    /** Statistics required for writing an sstable efficiently */
    EncodingStats encodingStats();

    /**
     * The table metadata, taken from the associated memtable.
     *
     * @return {@code memtable().metadata()}
     */
    default TableMetadata metadata()
    {
        return memtable().metadata();
    }

    /**
     * Whether this set lists no partitions at all.
     *
     * @return true if {@code partitionCount()} is zero, false if there is at least one partition to flush
     */
    default boolean isEmpty()
    {
        // A set is empty when it holds no partitions; the previous "> 0" test reported the opposite.
        return partitionCount() == 0;
    }
}
// Lifecycle management
/**
* Called to tell the memtable that it is being switched out and will be flushed (or dropped) and discarded.
* Will be followed by a {@link #getFlushSet} call (if the table is not truncated or dropped), and a
* {@link #discard} call.
*
* @param writeBarrier The barrier that will signal that all writes to this memtable have completed. That is, the
* point after which writes cannot be accepted by this memtable (it is permitted for writes
* before this barrier to go into the next; see {@link #accepts}).
* @param commitLogUpperBound The upper commit log position for this memtable. The value may be modified after this
* call and will match the next memtable's lower commit log bound.
*/
void switchOut(OpOrder.Barrier writeBarrier, AtomicReference<CommitLogPosition> commitLogUpperBound);
/**
* Called when this memtable is no longer in use or required for outstanding flushes or operations.
* All held memory must be released.
*/
void discard();
/**
* Decide if this memtable should take a write with the given parameters, or if the write should go to the next
* memtable. This enforces that no writes after the barrier set by {@link #switchOut} can be accepted, and
* is also used to define a shared commit log bound as the upper for this memtable and lower for the next.
*
* @param opGroup the write operation group of the write
* @param commitLogPosition the commit log position of the write
* @return true if this memtable should take the write, false if it should go to the next memtable
*/
boolean accepts(OpOrder.Group opGroup, CommitLogPosition commitLogPosition);
/**
* Approximate commit log lower bound, <= getCommitLogLowerBound, used as a time stamp for ordering
* (see {@link #compareTo}).
*
* @return the approximate lower bound
*/
CommitLogPosition getApproximateCommitLogLowerBound();
/**
* The commit log position at the time that this memtable was created.
*
* @return the commit log lower bound of this memtable
*/
CommitLogPosition getCommitLogLowerBound();
/**
* The commit log position at the time that this memtable was switched out.
*
* @return the final commit log upper bound, as a {@link LastCommitLogPosition} marker
*/
LastCommitLogPosition getFinalCommitLogUpperBound();
/**
* True if the memtable can contain any data that was written before the given commit log position.
*
* @param position the commit log position to check against
* @return true if data written before {@code position} may be present in this memtable
*/
boolean mayContainDataBefore(CommitLogPosition position);
/**
* True if the memtable contains no data.
*
* @return true if nothing has been stored in this memtable
*/
boolean isClean();
/**
 * Orders memtables by time, as reflected in the approximate commit log lower bound each one reports.
 *
 * @param that the memtable to compare this one with
 * @return the result of comparing this memtable's approximate lower bound with that of {@code that}
 */
default int compareTo(Memtable that)
{
    CommitLogPosition ownBound = this.getApproximateCommitLogLowerBound();
    CommitLogPosition otherBound = that.getApproximateCommitLogLowerBound();
    return ownBound.compareTo(otherBound);
}
/**
* Decides whether the memtable should be switched/flushed for the passed reason.
* Normally this will return true, but e.g. persistent memtables may choose not to flush. Returning false will
* trigger further action for some reasons:
* - SCHEMA_CHANGE will be followed by {@link #metadataUpdated()}.
* - OWNED_RANGES_CHANGE will be followed by {@link #localRangesUpdated()}.
* - SNAPSHOT will be followed by {@link #performSnapshot}.
* - STREAMING/REPAIR will be followed by creating a FlushSet for the streamed/repaired ranges. This data will be
* used to create sstables, which will be streamed and then deleted.
* This will not be called to perform truncation or drop (in that case the memtable is unconditionally dropped),
* but a flush may nevertheless be requested in that case to prepare a snapshot.
*
* @param reason the reason for which a switch/flush is being considered
* @return true if the memtable should be switched/flushed for this reason, false otherwise
*/
boolean shouldSwitch(ColumnFamilyStore.FlushReason reason);
/**
* Called when the table's metadata is updated. The memtable's metadata reference now points to the new version.
* This will not be called if {@link #shouldSwitch}(SCHEMA_CHANGE) returns true, as the memtable will be swapped out
* instead.
*/
void metadataUpdated();
/**
* Called when the known ranges have been updated and owner.localRangeSplits() may return different values.
* This will not be called if {@link #shouldSwitch}(OWNED_RANGES_CHANGE) returns true, as the memtable will be
* swapped out instead.
*/
void localRangesUpdated();
/**
* If the memtable needs to do some special action for snapshots (e.g. because it is persistent and does not want
* to flush), it should return false from {@link #shouldSwitch} with reason SNAPSHOT and implement this method.
*
* @param snapshotName the name of the snapshot
*/
void performSnapshot(String snapshotName);
/**
 * Special commit log position marker used in the upper bound marker setting process
 * (see {@link org.apache.cassandra.db.ColumnFamilyStore#setCommitLogUpperBound} and {@link AbstractMemtable#accepts}).
 *
 * As a member type of an interface this class is implicitly public and static, so those modifiers are not repeated.
 */
final class LastCommitLogPosition extends CommitLogPosition
{
    /**
     * Creates a marker carrying the same segment id and position as the given commit log position.
     *
     * @param copy the position whose {@code segmentId} and {@code position} are copied
     */
    public LastCommitLogPosition(CommitLogPosition copy)
    {
        super(copy.segmentId, copy.position);
    }
}
}